Commit 49c2c5b

feat(aci): Database-tracked coordinated task-based DetectorGroup backfill
1 parent: d573be1

File tree

10 files changed: +1264 −1 lines

migrations_lockfile.txt

Lines changed: 1 addition & 1 deletion

@@ -39,4 +39,4 @@ tempest: 0001_squashed_0002_make_message_type_nullable
 
 uptime: 0048_delete_uptime_status_columns
 
-workflow_engine: 0093_add_action_config_index
+workflow_engine: 0094_add_error_backfill_status
src/sentry/workflow_engine/migrations/0094_add_error_backfill_status.py

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
# Generated by Django 5.2.1 on 2025-10-29 22:02

import django.db.models.deletion
from django.db import migrations, models

import sentry.db.models.fields.bounded
import sentry.db.models.fields.foreignkey
from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = False

    dependencies = [
        ("workflow_engine", "0093_add_action_config_index"),
    ]

    operations = [
        migrations.CreateModel(
            name="ErrorBackfillStatus",
            fields=[
                (
                    "id",
                    sentry.db.models.fields.bounded.BoundedBigAutoField(
                        primary_key=True, serialize=False
                    ),
                ),
                ("date_updated", models.DateTimeField(auto_now=True)),
                ("date_added", models.DateTimeField(auto_now_add=True)),
                ("status", models.CharField(db_index=True, default="not_started", max_length=20)),
                (
                    "detector",
                    sentry.db.models.fields.foreignkey.FlexibleForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="workflow_engine.detector",
                        unique=True,
                    ),
                ),
            ],
            options={
                "db_table": "workflow_engine_error_backfill_status",
                "indexes": [
                    models.Index(fields=["status", "date_updated"], name="errbkfl_stat_upd_idx")
                ],
            },
        ),
    ]
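The composite (status, date_updated) index created here matches the sweep queries the coordinator runs against this table (see the processor module below): "rows in status X not touched since a cutoff". A minimal sketch of that query shape, assuming only the model created above; it is an editor's illustration, not part of the commit, and the one-hour cutoff is invented for the example:

# Editor's sketch, not part of the commit: the (status, date_updated) index
# lets this sweep run as an index range scan rather than a table scan.
from datetime import UTC, datetime, timedelta

from sentry.workflow_engine.models import ErrorBackfillStatus

cutoff = datetime.now(UTC) - timedelta(hours=1)  # illustrative timeout only

# Filter on exact status plus a date_updated range, then bulk-update in place.
ErrorBackfillStatus.objects.filter(
    status="in_progress",
    date_updated__lt=cutoff,
).update(status="not_started")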

src/sentry/workflow_engine/models/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -15,6 +15,7 @@
     "DetectorGroup",
     "DetectorState",
     "DetectorWorkflow",
+    "ErrorBackfillStatus",
     "IncidentGroupOpenPeriod",
     "Workflow",
     "WorkflowDataConditionGroup",
@@ -36,6 +37,7 @@
 from .detector_group import DetectorGroup
 from .detector_state import DetectorState
 from .detector_workflow import DetectorWorkflow
+from .error_backfill_status import ErrorBackfillStatus
 from .incident_groupopenperiod import IncidentGroupOpenPeriod
 from .workflow import Workflow
 from .workflow_action_group_status import WorkflowActionGroupStatus
src/sentry/workflow_engine/models/error_backfill_status.py

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
from django.db import models

import sentry
from sentry.backup.scopes import RelocationScope
from sentry.db.models import DefaultFieldsModel, FlexibleForeignKey, region_silo_model


@region_silo_model
class ErrorBackfillStatus(DefaultFieldsModel):
    """
    Tracks the backfill status for creating DetectorGroup records for error detectors.

    This model coordinates the gradual backfill of DetectorGroup associations for existing
    error groups. Each record represents an error detector (one per project) that needs all
    of its groups to be associated with DetectorGroup records. The status field tracks progress
    through the backfill lifecycle.
    """

    __relocation_scope__ = RelocationScope.Excluded

    detector = FlexibleForeignKey("workflow_engine.Detector", on_delete=models.CASCADE, unique=True)

    # Status values: not_started, in_progress, completed
    status = models.CharField(
        max_length=20,
        choices=[
            ("not_started", "Not Started"),
            ("in_progress", "In Progress"),
            ("completed", "Completed"),
        ],
        default="not_started",
        db_index=True,
    )

    class Meta:
        db_table = "workflow_engine_error_backfill_status"
        app_label = "workflow_engine"
        indexes = [
            models.Index(fields=["status", "date_updated"], name="errbkfl_stat_upd_idx"),
        ]

    __repr__ = sentry.db.models.sane_repr("detector_id", "status")
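A minimal usage sketch of the lifecycle the docstring describes: one tracking row per error detector, moving not_started → in_progress → completed. This is an editor's illustration, not part of the commit; seed_row and advance are hypothetical helpers:

# Editor's sketch (hypothetical helpers, not part of the commit).
from sentry.workflow_engine.models import Detector, ErrorBackfillStatus


def seed_row(detector: Detector) -> ErrorBackfillStatus:
    # Idempotent: the unique detector FK guarantees at most one row per detector.
    row, _ = ErrorBackfillStatus.objects.get_or_create(
        detector_id=detector.id, defaults={"status": "not_started"}
    )
    return row


def advance(row: ErrorBackfillStatus, new_status: str) -> None:
    # DefaultFieldsModel's auto_now field bumps date_updated on save; the
    # coordinator's stuck-item detection keys off that timestamp.
    row.status = new_status
    row.save(update_fields=["status", "date_updated"])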
Lines changed: 251 additions & 0 deletions

@@ -0,0 +1,251 @@
"""
Processor functions for backfilling DetectorGroup associations for error detectors.

These functions contain the actual business logic for the backfill process, separated
from the task definitions to avoid heavy import dependencies.
"""

import logging
from datetime import UTC, datetime, timedelta

from django.db.models import Exists, OuterRef

from sentry.grouping.grouptype import ErrorGroupType
from sentry.models.group import Group, GroupStatus
from sentry.utils import metrics
from sentry.utils.query import RangeQuerySetWrapper
from sentry.workflow_engine.models import Detector, DetectorGroup, ErrorBackfillStatus

logger = logging.getLogger(__name__)

GROUPS_PER_BATCH = 400


def process_detector_backfill(backfill_status_id: int) -> None:
    """
    Process a single ErrorBackfillStatus record, creating DetectorGroup associations
    for all open ErrorGroupType Groups in the detector's project.
    """
    try:
        backfill_status = ErrorBackfillStatus.objects.select_for_update().get(id=backfill_status_id)
    except ErrorBackfillStatus.DoesNotExist:
        logger.warning(
            "error_detector_backfill.status_not_found",
            extra={"backfill_status_id": backfill_status_id},
        )
        return

    if backfill_status.status != "in_progress":
        backfill_status.status = "in_progress"
        backfill_status.save(update_fields=["status", "date_updated"])

    try:
        detector = Detector.objects.get(id=backfill_status.detector_id)
        project_id = detector.project_id

        all_unresolved_groups = Group.objects.filter(
            project_id=project_id,
            status=GroupStatus.UNRESOLVED,
            type=ErrorGroupType.type_id,
        )

        # Use NOT EXISTS subquery for efficiency
        existing_detector_groups_subquery = DetectorGroup.objects.filter(
            detector_id=detector.id, group_id=OuterRef("id")
        )

        groups_needing_detector_groups = all_unresolved_groups.exclude(
            Exists(existing_detector_groups_subquery)
        )

        created_count = 0

        for group in RangeQuerySetWrapper(groups_needing_detector_groups, step=GROUPS_PER_BATCH):
            detector_group, created = DetectorGroup.objects.get_or_create(
                detector_id=detector.id,
                group_id=group.id,
            )
            if created:
                detector_group.date_added = group.first_seen
                detector_group.save(update_fields=["date_added"])
                created_count += 1

        backfill_status.status = "completed"
        backfill_status.save(update_fields=["status", "date_updated"])

        metrics.incr("error_detector_backfill.process_success")
        metrics.incr("error_detector_backfill.groups_created", amount=created_count)

        logger.info(
            "error_detector_backfill.completed",
            extra={
                "backfill_status_id": backfill_status_id,
                "detector_id": detector.id,
                "project_id": project_id,
                "groups_created": created_count,
            },
        )

    except Exception as e:
        logger.exception(
            "error_detector_backfill.failed",
            extra={
                "backfill_status_id": backfill_status_id,
                "error": str(e),
            },
        )
        metrics.incr("error_detector_backfill.process_error")
        raise


def coordinate_backfills(
    max_batch_size: int,
    in_progress_timeout: timedelta,
    completed_cleanup_age: timedelta,
    schedule_task_fn,
) -> None:
    """
    Coordinate the error detector backfill process: reset stuck items, delete old completed
    items, and schedule new pending backfills.
    """
    stuck_cutoff = datetime.now(UTC) - in_progress_timeout
    stuck_count = ErrorBackfillStatus.objects.filter(
        status="in_progress",
        date_updated__lt=stuck_cutoff,
    ).update(
        status="not_started",
    )

    if stuck_count > 0:
        logger.info(
            "error_detector_backfill.reset_stuck",
            extra={"count": stuck_count},
        )
        metrics.incr("error_detector_backfill.reset_stuck", amount=stuck_count)

    completed_cutoff = datetime.now(UTC) - completed_cleanup_age
    deleted_count, _ = ErrorBackfillStatus.objects.filter(
        status="completed",
        date_updated__lt=completed_cutoff,
    ).delete()

    if deleted_count > 0:
        logger.info(
            "error_detector_backfill.cleaned_up",
            extra={"count": deleted_count},
        )
        metrics.incr("error_detector_backfill.cleaned_up", amount=deleted_count)

    pending_items = ErrorBackfillStatus.objects.filter(
        status="not_started",
    ).order_by(
        "date_added"
    )[:max_batch_size]

    scheduled_count = 0
    for item in pending_items:
        try:
            schedule_task_fn(item.id)
            scheduled_count += 1
        except Exception as e:
            logger.exception(
                "error_detector_backfill.schedule_failed",
                extra={
                    "backfill_status_id": item.id,
                    "error": str(e),
                },
            )

    if scheduled_count > 0:
        logger.info(
            "error_detector_backfill.scheduled",
            extra={"count": scheduled_count},
        )
        metrics.incr("error_detector_backfill.scheduled", amount=scheduled_count)

    total_pending = ErrorBackfillStatus.objects.filter(status="not_started").count()
    total_in_progress = ErrorBackfillStatus.objects.filter(status="in_progress").count()
    total_completed = ErrorBackfillStatus.objects.filter(status="completed").count()

    logger.info(
        "error_detector_backfill.coordinator_run",
        extra={
            "scheduled": scheduled_count,
            "stuck_reset": stuck_count,
            "cleaned_up": deleted_count,
            "total_pending": total_pending,
            "total_in_progress": total_in_progress,
            "total_completed": total_completed,
        },
    )

    metrics.gauge("error_detector_backfill.pending", total_pending)
    metrics.gauge("error_detector_backfill.in_progress", total_in_progress)
    metrics.gauge("error_detector_backfill.completed", total_completed)


def populate_backfill_status_records(
    start_from: int | None = None, deadline: datetime | None = None
) -> int | None:
    """
    Populate ErrorBackfillStatus records for all error detectors.

    Returns the detector ID to resume from if the deadline is reached, or None if complete.
    """

    def process_batch(detectors: list[Detector]) -> int:
        detector_ids = [d.id for d in detectors]

        existing_ids = set(
            ErrorBackfillStatus.objects.filter(detector_id__in=detector_ids).values_list(
                "detector_id", flat=True
            )
        )

        new_records = [
            ErrorBackfillStatus(detector_id=d.id, status="not_started")
            for d in detectors
            if d.id not in existing_ids
        ]

        if new_records:
            ErrorBackfillStatus.objects.bulk_create(new_records, ignore_conflicts=True)
            return len(new_records)
        return 0

    error_detectors = Detector.objects.filter(type=ErrorGroupType.slug)
    if start_from is not None:
        error_detectors = error_detectors.filter(id__gte=start_from)

    created_count = 0
    batch_size = 1000
    batch_detectors = []

    for detector in RangeQuerySetWrapper(error_detectors, step=batch_size):
        batch_detectors.append(detector)

        if deadline and datetime.now(UTC) >= deadline:
            logger.info(
                "error_detector_backfill.populate_deadline_reached",
                extra={
                    "created_count": created_count,
                    "resume_from": detector.id,
                },
            )
            metrics.incr("error_detector_backfill.populated", amount=created_count)
            return detector.id

        if len(batch_detectors) >= batch_size:
            created_count += process_batch(batch_detectors)
            batch_detectors = []

    if batch_detectors:
        created_count += process_batch(batch_detectors)

    logger.info(
        "error_detector_backfill.populated",
        extra={"created_count": created_count},
    )

    metrics.incr("error_detector_backfill.populated", amount=created_count)
    return None
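The module docstring keeps these functions apart from the task definitions, and the commit message calls the design "coordinated task-based"; the actual task files are among the changed files not shown in this view. A minimal wiring sketch under that assumption follows — it is an editor's illustration, every name in it (schedule_process_task, run_coordinator, run_populate) is hypothetical, and the batch size, timeout, cleanup age, and deadline values are invented for the example:

# Editor's sketch, not part of the commit; assumes it lives alongside the
# processor functions above, so process_detector_backfill, coordinate_backfills,
# and populate_backfill_status_records are in scope. Task-framework plumbing
# (queues, decorators, retries) is deliberately elided.
from datetime import UTC, datetime, timedelta


def schedule_process_task(backfill_status_id: int) -> None:
    # Assumption: the real task layer enqueues an async per-detector task here
    # (e.g. a .delay(backfill_status_id) call); shown synchronously for clarity.
    process_detector_backfill(backfill_status_id)


def run_coordinator() -> None:
    # One periodic sweep: reset stuck rows, purge old completed rows, then
    # fan out up to max_batch_size pending backfills (values illustrative).
    coordinate_backfills(
        max_batch_size=100,
        in_progress_timeout=timedelta(hours=1),
        completed_cleanup_age=timedelta(days=7),
        schedule_task_fn=schedule_process_task,
    )


def run_populate() -> None:
    # Resumable seeding: call until None is returned, bounding each slice with
    # a soft deadline so no single run holds a worker for long. The returned
    # detector ID feeds the next slice's start_from.
    resume_from: int | None = None
    while True:
        resume_from = populate_backfill_status_records(
            start_from=resume_from,
            deadline=datetime.now(UTC) + timedelta(minutes=5),
        )
        if resume_from is None:
            break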
