Commit 49c2c5b

feat(aci): Database-tracked coordinated task-based DetectorGroup backfill
1 parent: d573be1

File tree

10 files changed: +1264 −1 lines

migrations_lockfile.txt

Lines changed: 1 addition & 1 deletion

@@ -39,4 +39,4 @@ tempest: 0001_squashed_0002_make_message_type_nullable
 
 uptime: 0048_delete_uptime_status_columns
 
-workflow_engine: 0093_add_action_config_index
+workflow_engine: 0094_add_error_backfill_status
src/sentry/workflow_engine/migrations/0094_add_error_backfill_status.py

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
# Generated by Django 5.2.1 on 2025-10-29 22:02

import django.db.models.deletion
from django.db import migrations, models

import sentry.db.models.fields.bounded
import sentry.db.models.fields.foreignkey
from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
    # This flag is used to mark that a migration shouldn't be automatically run in production.
    # This should only be used for operations where it's safe to run the migration after your
    # code has deployed. So this should not be used for most operations that alter the schema
    # of a table.
    # Here are some things that make sense to mark as post deployment:
    # - Large data migrations. Typically we want these to be run manually so that they can be
    #   monitored and not block the deploy for a long period of time while they run.
    # - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
    #   run this outside deployments so that we don't block them. Note that while adding an index
    #   is a schema change, it's completely safe to run the operation after the code has deployed.
    # Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

    is_post_deployment = False

    dependencies = [
        ("workflow_engine", "0093_add_action_config_index"),
    ]

    operations = [
        migrations.CreateModel(
            name="ErrorBackfillStatus",
            fields=[
                (
                    "id",
                    sentry.db.models.fields.bounded.BoundedBigAutoField(
                        primary_key=True, serialize=False
                    ),
                ),
                ("date_updated", models.DateTimeField(auto_now=True)),
                ("date_added", models.DateTimeField(auto_now_add=True)),
                ("status", models.CharField(db_index=True, default="not_started", max_length=20)),
                (
                    "detector",
                    sentry.db.models.fields.foreignkey.FlexibleForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="workflow_engine.detector",
                        unique=True,
                    ),
                ),
            ],
            options={
                "db_table": "workflow_engine_error_backfill_status",
                "indexes": [
                    models.Index(fields=["status", "date_updated"], name="errbkfl_stat_upd_idx")
                ],
            },
        ),
    ]
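The composite (status, date_updated) index created here matches the sweep queries the coordinator runs against this table (see the processor module below): "rows in status X not touched since a cutoff". A minimal sketch of that query shape, assuming only the model created above; it is an editor's illustration, not part of the commit, and the one-hour cutoff is invented for the example:

# Editor's sketch, not part of the commit: the (status, date_updated) index
# lets this sweep run as an index range scan rather than a table scan.
from datetime import UTC, datetime, timedelta

from sentry.workflow_engine.models import ErrorBackfillStatus

cutoff = datetime.now(UTC) - timedelta(hours=1)  # illustrative timeout only

# Filter on exact status plus a date_updated range, then bulk-update in place.
ErrorBackfillStatus.objects.filter(
    status="in_progress",
    date_updated__lt=cutoff,
).update(status="not_started")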

src/sentry/workflow_engine/models/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -15,6 +15,7 @@
     "DetectorGroup",
     "DetectorState",
     "DetectorWorkflow",
+    "ErrorBackfillStatus",
     "IncidentGroupOpenPeriod",
     "Workflow",
     "WorkflowDataConditionGroup",
@@ -36,6 +37,7 @@
 from .detector_group import DetectorGroup
 from .detector_state import DetectorState
 from .detector_workflow import DetectorWorkflow
+from .error_backfill_status import ErrorBackfillStatus
 from .incident_groupopenperiod import IncidentGroupOpenPeriod
 from .workflow import Workflow
 from .workflow_action_group_status import WorkflowActionGroupStatus
src/sentry/workflow_engine/models/error_backfill_status.py

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
from django.db import models

import sentry
from sentry.backup.scopes import RelocationScope
from sentry.db.models import DefaultFieldsModel, FlexibleForeignKey, region_silo_model


@region_silo_model
class ErrorBackfillStatus(DefaultFieldsModel):
    """
    Tracks the backfill status for creating DetectorGroup records for error detectors.

    This model coordinates the gradual backfill of DetectorGroup associations for existing
    error groups. Each record represents an error detector (one per project) that needs all
    of its groups to be associated with DetectorGroup records. The status field tracks progress
    through the backfill lifecycle.
    """

    __relocation_scope__ = RelocationScope.Excluded

    detector = FlexibleForeignKey("workflow_engine.Detector", on_delete=models.CASCADE, unique=True)

    # Status values: not_started, in_progress, completed
    status = models.CharField(
        max_length=20,
        choices=[
            ("not_started", "Not Started"),
            ("in_progress", "In Progress"),
            ("completed", "Completed"),
        ],
        default="not_started",
        db_index=True,
    )

    class Meta:
        db_table = "workflow_engine_error_backfill_status"
        app_label = "workflow_engine"
        indexes = [
            models.Index(fields=["status", "date_updated"], name="errbkfl_stat_upd_idx"),
        ]

    __repr__ = sentry.db.models.sane_repr("detector_id", "status")
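A minimal usage sketch of the lifecycle the docstring describes: one tracking row per error detector, moving not_started → in_progress → completed. This is an editor's illustration, not part of the commit; seed_row and advance are hypothetical helpers:

# Editor's sketch (hypothetical helpers, not part of the commit).
from sentry.workflow_engine.models import Detector, ErrorBackfillStatus


def seed_row(detector: Detector) -> ErrorBackfillStatus:
    # Idempotent: the unique detector FK guarantees at most one row per detector.
    row, _ = ErrorBackfillStatus.objects.get_or_create(
        detector_id=detector.id, defaults={"status": "not_started"}
    )
    return row


def advance(row: ErrorBackfillStatus, new_status: str) -> None:
    # DefaultFieldsModel's auto_now field bumps date_updated on save; the
    # coordinator's stuck-item detection keys off that timestamp.
    row.status = new_status
    row.save(update_fields=["status", "date_updated"])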
Lines changed: 251 additions & 0 deletions

@@ -0,0 +1,251 @@
"""
Processor functions for backfilling DetectorGroup associations for error detectors.

These functions contain the actual business logic for the backfill process, separated
from the task definitions to avoid heavy import dependencies.
"""

import logging
from datetime import UTC, datetime, timedelta

from django.db.models import Exists, OuterRef

from sentry.grouping.grouptype import ErrorGroupType
from sentry.models.group import Group, GroupStatus
from sentry.utils import metrics
from sentry.utils.query import RangeQuerySetWrapper
from sentry.workflow_engine.models import Detector, DetectorGroup, ErrorBackfillStatus

logger = logging.getLogger(__name__)

GROUPS_PER_BATCH = 400


def process_detector_backfill(backfill_status_id: int) -> None:
    """
    Process a single ErrorBackfillStatus record, creating DetectorGroup associations
    for all open ErrorGroupType Groups in the detector's project.
    """
    try:
        backfill_status = ErrorBackfillStatus.objects.select_for_update().get(id=backfill_status_id)
    except ErrorBackfillStatus.DoesNotExist:
        logger.warning(
            "error_detector_backfill.status_not_found",
            extra={"backfill_status_id": backfill_status_id},
        )
        return

    if backfill_status.status != "in_progress":
        backfill_status.status = "in_progress"
        backfill_status.save(update_fields=["status", "date_updated"])

    try:
        detector = Detector.objects.get(id=backfill_status.detector_id)
        project_id = detector.project_id

        all_unresolved_groups = Group.objects.filter(
            project_id=project_id,
            status=GroupStatus.UNRESOLVED,
            type=ErrorGroupType.type_id,
        )

        # Use NOT EXISTS subquery for efficiency
        existing_detector_groups_subquery = DetectorGroup.objects.filter(
            detector_id=detector.id, group_id=OuterRef("id")
        )

        groups_needing_detector_groups = all_unresolved_groups.exclude(
            Exists(existing_detector_groups_subquery)
        )

        created_count = 0

        for group in RangeQuerySetWrapper(groups_needing_detector_groups, step=GROUPS_PER_BATCH):
            detector_group, created = DetectorGroup.objects.get_or_create(
                detector_id=detector.id,
                group_id=group.id,
            )
            if created:
                detector_group.date_added = group.first_seen
                detector_group.save(update_fields=["date_added"])
                created_count += 1

        backfill_status.status = "completed"
        backfill_status.save(update_fields=["status", "date_updated"])

        metrics.incr("error_detector_backfill.process_success")
        metrics.incr("error_detector_backfill.groups_created", amount=created_count)

        logger.info(
            "error_detector_backfill.completed",
            extra={
                "backfill_status_id": backfill_status_id,
                "detector_id": detector.id,
                "project_id": project_id,
                "groups_created": created_count,
            },
        )

    except Exception as e:
        logger.exception(
            "error_detector_backfill.failed",
            extra={
                "backfill_status_id": backfill_status_id,
                "error": str(e),
            },
        )
        metrics.incr("error_detector_backfill.process_error")
        raise


def coordinate_backfills(
    max_batch_size: int,
    in_progress_timeout: timedelta,
    completed_cleanup_age: timedelta,
    schedule_task_fn,
) -> None:
    """
    Coordinate the error detector backfill process: reset stuck items, delete old completed
    items, and schedule new pending backfills.
    """
    stuck_cutoff = datetime.now(UTC) - in_progress_timeout
    stuck_count = ErrorBackfillStatus.objects.filter(
        status="in_progress",
        date_updated__lt=stuck_cutoff,
    ).update(
        status="not_started",
    )

    if stuck_count > 0:
        logger.info(
            "error_detector_backfill.reset_stuck",
            extra={"count": stuck_count},
        )
        metrics.incr("error_detector_backfill.reset_stuck", amount=stuck_count)

    completed_cutoff = datetime.now(UTC) - completed_cleanup_age
    deleted_count, _ = ErrorBackfillStatus.objects.filter(
        status="completed",
        date_updated__lt=completed_cutoff,
    ).delete()

    if deleted_count > 0:
        logger.info(
            "error_detector_backfill.cleaned_up",
            extra={"count": deleted_count},
        )
        metrics.incr("error_detector_backfill.cleaned_up", amount=deleted_count)

    pending_items = ErrorBackfillStatus.objects.filter(
        status="not_started",
    ).order_by(
        "date_added"
    )[:max_batch_size]

    scheduled_count = 0
    for item in pending_items:
        try:
            schedule_task_fn(item.id)
            scheduled_count += 1
        except Exception as e:
            logger.exception(
                "error_detector_backfill.schedule_failed",
                extra={
                    "backfill_status_id": item.id,
                    "error": str(e),
                },
            )

    if scheduled_count > 0:
        logger.info(
            "error_detector_backfill.scheduled",
            extra={"count": scheduled_count},
        )
        metrics.incr("error_detector_backfill.scheduled", amount=scheduled_count)

    total_pending = ErrorBackfillStatus.objects.filter(status="not_started").count()
    total_in_progress = ErrorBackfillStatus.objects.filter(status="in_progress").count()
    total_completed = ErrorBackfillStatus.objects.filter(status="completed").count()

    logger.info(
        "error_detector_backfill.coordinator_run",
        extra={
            "scheduled": scheduled_count,
            "stuck_reset": stuck_count,
            "cleaned_up": deleted_count,
            "total_pending": total_pending,
            "total_in_progress": total_in_progress,
            "total_completed": total_completed,
        },
    )

    metrics.gauge("error_detector_backfill.pending", total_pending)
    metrics.gauge("error_detector_backfill.in_progress", total_in_progress)
    metrics.gauge("error_detector_backfill.completed", total_completed)


def populate_backfill_status_records(
    start_from: int | None = None, deadline: datetime | None = None
) -> int | None:
    """
    Populate ErrorBackfillStatus records for all error detectors.

    Returns the detector ID to resume from if the deadline is reached, or None if complete.
    """

    def process_batch(detectors: list[Detector]) -> int:
        detector_ids = [d.id for d in detectors]

        existing_ids = set(
            ErrorBackfillStatus.objects.filter(detector_id__in=detector_ids).values_list(
                "detector_id", flat=True
            )
        )

        new_records = [
            ErrorBackfillStatus(detector_id=d.id, status="not_started")
            for d in detectors
            if d.id not in existing_ids
        ]

        if new_records:
            ErrorBackfillStatus.objects.bulk_create(new_records, ignore_conflicts=True)
            return len(new_records)
        return 0

    error_detectors = Detector.objects.filter(type=ErrorGroupType.slug)
    if start_from is not None:
        error_detectors = error_detectors.filter(id__gte=start_from)

    created_count = 0
    batch_size = 1000
    batch_detectors = []

    for detector in RangeQuerySetWrapper(error_detectors, step=batch_size):
        batch_detectors.append(detector)

        if deadline and datetime.now(UTC) >= deadline:
            logger.info(
                "error_detector_backfill.populate_deadline_reached",
                extra={
                    "created_count": created_count,
                    "resume_from": detector.id,
                },
            )
            metrics.incr("error_detector_backfill.populated", amount=created_count)
            return detector.id

        if len(batch_detectors) >= batch_size:
            created_count += process_batch(batch_detectors)
            batch_detectors = []

    if batch_detectors:
        created_count += process_batch(batch_detectors)

    logger.info(
        "error_detector_backfill.populated",
        extra={"created_count": created_count},
    )

    metrics.incr("error_detector_backfill.populated", amount=created_count)
    return None
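The module docstring keeps these functions apart from the task definitions, and the commit message calls the design "coordinated task-based"; the actual task files are among the changed files not shown in this view. A minimal wiring sketch under that assumption follows — it is an editor's illustration, every name in it (schedule_process_task, run_coordinator, run_populate) is hypothetical, and the batch size, timeout, cleanup age, and deadline values are invented for the example:

# Editor's sketch, not part of the commit; assumes it lives alongside the
# processor functions above, so process_detector_backfill, coordinate_backfills,
# and populate_backfill_status_records are in scope. Task-framework plumbing
# (queues, decorators, retries) is deliberately elided.
from datetime import UTC, datetime, timedelta


def schedule_process_task(backfill_status_id: int) -> None:
    # Assumption: the real task layer enqueues an async per-detector task here
    # (e.g. a .delay(backfill_status_id) call); shown synchronously for clarity.
    process_detector_backfill(backfill_status_id)


def run_coordinator() -> None:
    # One periodic sweep: reset stuck rows, purge old completed rows, then
    # fan out up to max_batch_size pending backfills (values illustrative).
    coordinate_backfills(
        max_batch_size=100,
        in_progress_timeout=timedelta(hours=1),
        completed_cleanup_age=timedelta(days=7),
        schedule_task_fn=schedule_process_task,
    )


def run_populate() -> None:
    # Resumable seeding: call until None is returned, bounding each slice with
    # a soft deadline so no single run holds a worker for long. The returned
    # detector ID feeds the next slice's start_from.
    resume_from: int | None = None
    while True:
        resume_from = populate_backfill_status_records(
            start_from=resume_from,
            deadline=datetime.now(UTC) + timedelta(minutes=5),
        )
        if resume_from is None:
            break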
