Skip to content

Commit dca7d73

Browse files
authored
ref(seer): Refactor night shift into modules and use search backend (#112635)
Refactors night shift from a single file into a package with separate modules for cron scheduling, simple triage, agentic triage, and shared models.

Switches `fixability_score_strategy` from a direct ORM query to using `search.backend.query()` with the `recommended` sort. This gives us the same ranking algorithm used in the issues list UI (recency, spike detection, severity, user impact, event volume) as a pre-filter, then re-ranks by fixability score in-memory.

Other changes:
- Adds `reset_snuba_data` flag to `SnubaTestCase` so tests can opt out of dropping ClickHouse data between runs
- Adds `bin/seer/trigger-night-shift` script for local testing
- Registers a new Snuba referrer for the search query
- Uses pydantic `model_validate_json` for LLM response parsing in agentic triage
1 parent eafc821 commit dca7d73

File tree

10 files changed

+392
-148
lines changed

10 files changed

+392
-148
lines changed

bin/seer/trigger-night-shift

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env python
2+
3+
from sentry.runner import configure
4+
5+
configure()
6+
7+
import argparse
8+
import sys
9+
10+
from sentry.tasks.seer.night_shift.cron import run_night_shift_for_org
11+
12+
13+
def main(org_id: int) -> None:
    """Run night shift for a single organization, reporting progress on stdout.

    Calls ``run_night_shift_for_org`` directly with ``org_id`` and writes a
    marker line to stdout before and after the call.
    """
    out = sys.stdout
    out.write(f"> Running night shift for organization {org_id}...\n")
    run_night_shift_for_org(org_id)
    out.write("> Done.\n")
17+
18+
19+
if __name__ == "__main__":
    # Script entry point: takes one optional positional organization ID
    # (falls back to 1 when omitted) and runs night shift for it.
    cli = argparse.ArgumentParser(description="Trigger night shift for an organization.")
    cli.add_argument(
        "org_id",
        type=int,
        nargs="?",
        default=1,
        help="Organization ID (default: 1)",
    )
    main(cli.parse_args().org_id)

src/sentry/conf/server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -992,7 +992,7 @@ def SOCIAL_AUTH_DEFAULT_USERNAME() -> str:
992992
"sentry.tasks.seer.explorer_index",
993993
"sentry.tasks.seer.context_engine_index",
994994
"sentry.tasks.seer.lightweight_rca_cluster",
995-
"sentry.tasks.seer.night_shift",
995+
"sentry.tasks.seer.night_shift.cron",
996996
# Used for tests
997997
"sentry.taskworker.tasks.examples",
998998
)

src/sentry/snuba/referrer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -778,6 +778,7 @@ class Referrer(StrEnum):
778778
SEARCH_SAMPLE = "search_sample"
779779
SEARCH = "search"
780780
SEARCH_GROUP_INDEX = "search.group_index"
781+
SEER_NIGHT_SHIFT_FIXABILITY_SCORE_STRATEGY = "seer.night_shift.fixability_score_strategy"
781782
SEARCH_GROUP_INDEX_SAMPLE = "search.group_index_sample"
782783
SEARCH_SNUBA_GROUP_ATTRIBUTES_SEARCH_QUERY = "search.snuba.group_attributes_search.query"
783784
SEARCH_SNUBA_GROUP_ATTRIBUTES_SEARCH_HITS = "search.snuba.group_attributes_search.hits"

src/sentry/tasks/seer/night_shift/__init__.py

Whitespace-only changes.
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
import textwrap
5+
from collections.abc import Sequence
6+
7+
import orjson
8+
import pydantic
9+
10+
from sentry.models.organization import Organization
11+
from sentry.models.project import Project
12+
from sentry.seer.signed_seer_api import LlmGenerateRequest, make_llm_generate_request
13+
from sentry.tasks.seer.night_shift.models import TriageAction, TriageResult
14+
from sentry.tasks.seer.night_shift.simple_triage import (
15+
ScoredCandidate,
16+
fixability_score_strategy,
17+
priority_label,
18+
)
19+
20+
logger = logging.getLogger("sentry.tasks.seer.night_shift")
21+
22+
23+
class _TriageVerdict(pydantic.BaseModel):
    """A single per-candidate decision parsed from the triage LLM response."""

    # ID of the candidate group this decision refers to.
    group_id: int
    # Action chosen for the group.
    action: TriageAction
    # Brief justification for the decision, as requested in the prompt.
    reason: str
27+
28+
29+
class _TriageResponse(pydantic.BaseModel):
    """Structured payload expected back from the triage LLM call."""

    verdicts: list[_TriageVerdict]

    @pydantic.validator("verdicts")
    def filter_skips(cls, v: list[_TriageVerdict]) -> list[_TriageVerdict]:
        """Discard verdicts whose action is ``skip`` so only actionable ones remain."""
        kept: list[_TriageVerdict] = []
        for verdict in v:
            if verdict.action == TriageAction.SKIP:
                continue
            kept.append(verdict)
        return kept
35+
36+
37+
def agentic_triage_strategy(
    projects: Sequence[Project],
    organization: Organization,
) -> list[TriageResult]:
    """
    Pick candidates with the fixability-score strategy, then pass them through
    the LLM triage step that assigns an action to each candidate.

    Returns an empty list when the scoring step yields no candidates; the LLM
    is not called in that case.
    """
    scored = fixability_score_strategy(projects)
    return _triage_candidates(scored, organization) if scored else []
50+
51+
52+
def _triage_candidates(
    candidates: list[ScoredCandidate],
    organization: Organization,
) -> list[TriageResult]:
    """
    Call Seer LLM proxy to triage the candidate batch via a single LLM call.

    Returns one ``TriageResult`` per candidate the LLM didn't skip, with its
    assigned action. Verdicts that reference a group ID outside the candidate
    batch are ignored, and repeated verdicts for the same group are collapsed
    into one result (the last verdict for a group wins, matching what is
    logged).

    Any failure (HTTP status >= 400, empty content, or an exception while
    requesting/parsing) is logged and results in an empty list.
    """
    groups_by_id = {c.group.id: c.group for c in candidates}

    body = LlmGenerateRequest(
        provider="gemini",
        model="pro-preview",
        referrer="night_shift.triage",
        prompt=_build_triage_prompt(candidates),
        system_prompt="",
        temperature=0.0,
        max_tokens=4096,
        response_schema=_TriageResponse.schema(),
    )

    try:
        response = make_llm_generate_request(body, timeout=60)
        if response.status >= 400:
            logger.error(
                "night_shift.triage_request_failed",
                extra={
                    "organization_id": organization.id,
                    "status": response.status,
                },
            )
            return []

        data = orjson.loads(response.data)
        content = data.get("content")
        if not content:
            logger.error(
                "night_shift.triage_empty_response",
                extra={"organization_id": organization.id},
            )
            return []

        # NOTE(review): the description of this change mentions pydantic's
        # `model_validate_json`, while this code uses `parse_raw` / `schema()`
        # -- confirm which pydantic major version is targeted before switching.
        triage_response = _TriageResponse.parse_raw(content)
    except Exception:
        # Best-effort boundary: a failed triage call must not crash the task.
        logger.exception(
            "night_shift.triage_request_error",
            extra={"organization_id": organization.id},
        )
        return []

    # The model output is untrusted: it may repeat a group_id or invent one.
    # Collapsing verdicts into a dict keeps a single action per group, so a
    # group can never be returned (and later processed) twice.
    actions_by_group_id = {v.group_id: v.action for v in triage_response.verdicts}

    results = [
        TriageResult(group=groups_by_id[group_id], action=action)
        for group_id, action in actions_by_group_id.items()
        if group_id in groups_by_id
    ]

    logger.info(
        "night_shift.triage_verdicts",
        extra={
            "organization_id": organization.id,
            "verdicts": actions_by_group_id,
        },
    )

    return results
117+
118+
119+
def _build_triage_prompt(
    candidates: list[ScoredCandidate],
) -> str:
    """
    Build the triage prompt: fixed instructions followed by one line per
    candidate (group ID, title, culprit, fixability, times seen, first seen,
    priority).

    The returned text always ends with a newline after the last candidate.
    """
    candidates_block = "\n".join(
        f"- group_id={c.group.id} | title={c.group.title or 'Unknown error'!r} "
        f"| culprit={c.group.culprit or 'unknown'!r} "
        f"| fixability={c.fixability:.2f} | times_seen={c.times_seen} "
        f"| first_seen={c.group.first_seen.isoformat()} "
        f"| priority={priority_label(c.group.priority) or 'unknown'}"
        for c in candidates
    )

    # Dedent only the static template, then append the candidate lines.
    # Candidate lines after the first start at column 0, so dedenting the
    # combined text would find no common margin whenever the template is
    # indented, leaving the instructions indented for multi-candidate batches.
    instructions = textwrap.dedent("""\
        You are a triage agent for Sentry's Night Shift system. Your job is to review
        a batch of candidate issues and decide which ones are worth running automated
        root-cause analysis and code fixes on.

        For each candidate, choose one action:
        - "autofix": Run the full automated pipeline (root cause → solution → code changes).
        Choose this for issues that look clearly fixable from their title/culprit and have
        a high fixability score.
        - "root_cause_only": Only run root-cause analysis, don't attempt a fix.
        Choose this for issues that are worth investigating but may be too complex or
        ambiguous to auto-fix confidently.
        - "skip": Don't process this issue.
        Choose this for issues that are vague, likely duplicates of each other in this
        batch, or not worth spending compute on.

        Provide a brief reason for each decision.

        Candidates:
        """)

    return f"{instructions}{candidates_block}\n"
Lines changed: 7 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -2,37 +2,26 @@
22

33
import logging
44
from collections.abc import Sequence
5-
from dataclasses import dataclass
65
from datetime import timedelta
76

87
import sentry_sdk
9-
from django.db.models import F
108

119
from sentry import features, options
1210
from sentry.constants import ObjectStatus
13-
from sentry.models.group import Group, GroupStatus
1411
from sentry.models.organization import Organization, OrganizationStatus
1512
from sentry.models.project import Project
1613
from sentry.seer.autofix.constants import AutofixAutomationTuningSettings
17-
from sentry.seer.autofix.utils import is_issue_category_eligible
1814
from sentry.seer.models.project_repository import SeerProjectRepository
1915
from sentry.tasks.base import instrumented_task
16+
from sentry.tasks.seer.night_shift.agentic_triage import agentic_triage_strategy
2017
from sentry.taskworker.namespaces import seer_tasks
21-
from sentry.types.group import PriorityLevel
2218
from sentry.utils.iterators import chunked
2319
from sentry.utils.query import RangeQuerySetWrapper
2420

2521
logger = logging.getLogger("sentry.tasks.seer.night_shift")
2622

2723
NIGHT_SHIFT_DISPATCH_STEP_SECONDS = 37
2824
NIGHT_SHIFT_SPREAD_DURATION = timedelta(hours=4)
29-
NIGHT_SHIFT_MAX_CANDIDATES = 10
30-
NIGHT_SHIFT_ISSUE_FETCH_LIMIT = 100
31-
32-
# Weights for candidate scoring. Set to 0 to disable a signal.
33-
WEIGHT_FIXABILITY = 1.0
34-
WEIGHT_SEVERITY = 0.0
35-
WEIGHT_TIMES_SEEN = 0.0
3625

3726
FEATURE_NAMES = [
3827
"organizations:seer-night-shift",
@@ -64,9 +53,6 @@ def schedule_night_shift() -> None:
6453
100,
6554
):
6655
for org in _get_eligible_orgs_from_batch(org_batch):
67-
if bool(org.get_option("sentry:hide_ai_features")):
68-
continue
69-
7056
delay = (batch_index * NIGHT_SHIFT_DISPATCH_STEP_SECONDS) % spread_seconds
7157

7258
run_night_shift_for_org.apply_async(
@@ -81,28 +67,6 @@ def schedule_night_shift() -> None:
8167
)
8268

8369

84-
@dataclass
85-
class _ScoredCandidate:
86-
"""A candidate issue with raw signals for ranking."""
87-
88-
group_id: int
89-
project_id: int
90-
fixability: float
91-
times_seen: int
92-
severity: float
93-
94-
@property
95-
def score(self) -> float:
96-
return (
97-
WEIGHT_FIXABILITY * self.fixability
98-
+ WEIGHT_SEVERITY * self.severity
99-
+ WEIGHT_TIMES_SEEN * min(self.times_seen / 1000.0, 1.0)
100-
)
101-
102-
def __lt__(self, other: _ScoredCandidate) -> bool:
103-
return self.score < other.score
104-
105-
10670
@instrumented_task(
10771
name="sentry.tasks.seer.night_shift.run_night_shift_for_org",
10872
namespace=seer_tasks,
@@ -134,25 +98,21 @@ def run_night_shift_for_org(organization_id: int) -> None:
13498
)
13599
return
136100

137-
top_candidates = _fixability_score_strategy(eligible_projects)
101+
candidates = agentic_triage_strategy(eligible_projects, organization)
138102

139103
logger.info(
140104
"night_shift.candidates_selected",
141105
extra={
142106
"organization_id": organization_id,
143107
"organization_slug": organization.slug,
144108
"num_eligible_projects": len(eligible_projects),
145-
"num_candidates": len(top_candidates),
109+
"num_candidates": len(candidates),
146110
"candidates": [
147111
{
148-
"group_id": c.group_id,
149-
"project_id": c.project_id,
150-
"score": c.score,
151-
"fixability": c.fixability,
152-
"severity": c.severity,
153-
"times_seen": c.times_seen,
112+
"group_id": c.group.id,
113+
"action": c.action,
154114
}
155-
for c in top_candidates
115+
for c in candidates
156116
],
157117
},
158118
)
@@ -165,7 +125,7 @@ def _get_eligible_orgs_from_batch(
165125
Check feature flags for a batch of orgs using batch_has_for_organizations.
166126
Returns orgs that have all required feature flags enabled.
167127
"""
168-
eligible = list(orgs)
128+
eligible = [org for org in orgs if not org.get_option("sentry:hide_ai_features")]
169129

170130
for feature_name in FEATURE_NAMES:
171131
batch_result = features.batch_has_for_organizations(feature_name, eligible)
@@ -197,41 +157,3 @@ def _get_eligible_projects(organization: Organization) -> list[Project]:
197157
for p in projects
198158
if p.get_option("sentry:autofix_automation_tuning") != AutofixAutomationTuningSettings.OFF
199159
]
200-
201-
202-
def _fixability_score_strategy(
203-
projects: Sequence[Project],
204-
) -> list[_ScoredCandidate]:
205-
"""
206-
Rank issues by existing fixability score with times_seen as tiebreaker.
207-
Simple baseline — doesn't require any additional LLM calls.
208-
"""
209-
all_candidates: list[_ScoredCandidate] = []
210-
211-
for project_id_batch in chunked(projects, 100):
212-
groups = Group.objects.filter(
213-
project_id__in=[p.id for p in project_id_batch],
214-
status=GroupStatus.UNRESOLVED,
215-
seer_autofix_last_triggered__isnull=True,
216-
seer_explorer_autofix_last_triggered__isnull=True,
217-
).order_by(
218-
F("seer_fixability_score").desc(nulls_last=True),
219-
F("times_seen").desc(),
220-
)[:NIGHT_SHIFT_ISSUE_FETCH_LIMIT]
221-
222-
for group in groups:
223-
if not is_issue_category_eligible(group):
224-
continue
225-
226-
all_candidates.append(
227-
_ScoredCandidate(
228-
group_id=group.id,
229-
project_id=group.project_id,
230-
fixability=group.seer_fixability_score or 0.0,
231-
times_seen=group.times_seen,
232-
severity=(group.priority or 0) / PriorityLevel.HIGH,
233-
)
234-
)
235-
236-
all_candidates.sort(reverse=True)
237-
return all_candidates[:NIGHT_SHIFT_MAX_CANDIDATES]
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from __future__ import annotations
2+
3+
import enum
4+
from dataclasses import dataclass
5+
6+
from sentry.models.group import Group
7+
8+
9+
class TriageAction(enum.StrEnum):
10+
AUTOFIX = "autofix"
11+
ROOT_CAUSE_ONLY = "root_cause_only"
12+
SKIP = "skip"
13+
14+
15+
@dataclass
class TriageResult:
    """An issue group paired with the action chosen for it."""

    # The issue group the action applies to.
    group: Group
    # Action to take; defaults to AUTOFIX when not specified.
    action: TriageAction = TriageAction.AUTOFIX

0 commit comments

Comments
 (0)