|
1 | | -from collections.abc import Iterable |
| 1 | +from collections import defaultdict |
| 2 | +from collections.abc import Iterable, Mapping |
| 3 | +from datetime import UTC, datetime, timedelta |
| 4 | +from typing import Any |
2 | 5 |
|
3 | 6 | import sentry_sdk |
| 7 | +from django.core.cache import cache |
4 | 8 | from django.db.models import Q, QuerySet |
5 | 9 |
|
6 | 10 | from sentry.models.groupredirect import GroupRedirect |
|
14 | 18 | SIZE_THRESHOLD_FOR_CLICKHOUSE = 2500 |
15 | 19 |
|
16 | 20 |
|
| 21 | +def _build_group_redirect_by_group_id_cache_key(group_id: str | int) -> str: |
| 22 | + return f"groupredirectsforgroupid:{group_id}" |
| 23 | + |
| 24 | + |
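As a quick illustration of the key format (the ID 42 below is an arbitrary example, not taken from the change):

```python
# Hypothetical usage of the key builder above; 42 is an arbitrary example ID.
assert _build_group_redirect_by_group_id_cache_key(42) == "groupredirectsforgroupid:42"
```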
17 | 25 | def _get_all_related_redirects_query( |
18 | 26 | group_ids: set[str | int], |
19 | | -) -> QuerySet[GroupRedirect, tuple[int, int]]: |
20 | | - return ( |
21 | | - GroupRedirect.objects.filter( |
22 | | - Q(group_id__in=group_ids) | Q(previous_group_id__in=group_ids) |
23 | | - ).values_list("group_id", "previous_group_id") |
24 | | - # This order returns the newest redirects first. i.e. we're implicitly dropping |
25 | | - # the oldest redirects if we have >THRESHOLD. We choose to drop the oldest |
26 | | - # because they're least likely to have data in retention. |
27 | | - # Technically id != date_added, but it's a close appx (& much faster). |
28 | | - .order_by("-id") |
| 27 | +) -> QuerySet[GroupRedirect, Any]: |
| 28 | + return GroupRedirect.objects.filter( |
| 29 | + Q(group_id__in=group_ids) | Q(previous_group_id__in=group_ids) |
| 30 | + ).values_list("date_added", "group_id", "previous_group_id", named=True) |
| 31 | + |
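A minimal sketch of how rows from this query are consumed, assuming a configured database and arbitrary example IDs; with `named=True`, `values_list` yields namedtuple-like rows whose fields are read by attribute:

```python
# Sketch only: iterate the namedtuple-like rows returned by values_list(named=True).
# The group IDs {1, 2} are arbitrary examples.
for row in _get_all_related_redirects_query({1, 2}):
    print(row.date_added, row.group_id, row.previous_group_id)
```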
| 32 | + |
| 33 | +def _try_get_from_cache( |
| 34 | + group_ids: Iterable[str | int], |
| 35 | +) -> tuple[set[tuple[str | int, datetime]], set[str | int]]: |
| 36 | + """ |
| 37 | +    CACHE STRUCTURE: |
| 38 | +    group_id ==> set[(related_group_id, date_added)] |
| 39 | +

| 40 | +    Returns (cached (group_id, date_added) pairs, input group IDs missing from the cache). |
| 41 | +    """ |
| 44 | + id_to_keys = { |
| 45 | + group_id: _build_group_redirect_by_group_id_cache_key(group_id) for group_id in group_ids |
| 46 | + } |
| 47 | +    cache_results: Mapping[str, set[tuple[str | int, datetime]]] = cache.get_many(
| 48 | + id_to_keys.values() |
29 | 49 | ) |
30 | 50 |
|
| 51 | + cached_data = set().union(*cache_results.values()) |
| 52 | +    uncached_group_ids = { |
| 53 | +        group_id for group_id, key in id_to_keys.items() if key not in cache_results |
| 54 | +    } |
| 55 | + |
| 56 | + return (cached_data, uncached_group_ids) |
| 57 | + |
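To make the helper's contract concrete, here is a hypothetical sketch of the hit/miss split, assuming a working Django cache backend and made-up group IDs:

```python
# Hypothetical sketch of the cache read path; IDs and timestamps are made up.
from datetime import UTC, datetime

from django.core.cache import cache

now = datetime.now(UTC)
# Pretend group 1 already has a cached redirect entry pointing at group 2.
cache.set(_build_group_redirect_by_group_id_cache_key(1), {(2, now)}, timeout=300)

cached_data, uncached_group_ids = _try_get_from_cache({1, 3})
assert cached_data == {(2, now)}  # merged data recovered from the cache hit
assert uncached_group_ids == {3}  # cache miss; this ID falls through to Postgres
```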
31 | 58 |
|
32 | 59 | def get_all_merged_group_ids( |
33 | 60 | group_ids: Iterable[str | int], threshold=SIZE_THRESHOLD_FOR_CLICKHOUSE |
34 | 61 | ) -> set[str | int]: |
35 | 62 | with sentry_sdk.start_span(op="get_all_merged_group_ids") as span: |
36 | | - group_id_set = set(group_ids) |
37 | | - all_related_rows = _get_all_related_redirects_query(group_id_set) |
|  63 | +        # Initialize all IDs with a future time to ensure they aren't filtered out. |
|  64 | +        group_ids = set(group_ids)  # materialize; the iterable is consumed again below |
|  65 | +        future_time = datetime.now(UTC) + timedelta(minutes=1) |
|  66 | +        running_data = {(group_id, future_time) for group_id in group_ids} |
| 67 | + |
| 68 | + # Step 1: Try to get data from cache |
| 69 | + cached_data, uncached_group_ids = _try_get_from_cache(group_ids) |
| 70 | + running_data.update(cached_data) |
| 71 | + |
|  72 | +        # Step 2: Fetch redirects for the uncached IDs from Postgres (unordered) |
| 73 | + all_related_rows = _get_all_related_redirects_query(uncached_group_ids) |
| 74 | + id_to_related = defaultdict(set) |
38 | 75 |
|
39 | | - threshold_breaker_set = None |
| 76 | + for row in all_related_rows: |
| 77 | + if row.date_added is None: |
| 78 | + continue |
| 79 | + running_data.add((row.group_id, row.date_added)) |
| 80 | + running_data.add((row.previous_group_id, row.date_added)) |
40 | 81 |
|
41 | | - for r in all_related_rows: |
42 | | - group_id_set.update(r) |
| 82 | + id_to_related[row.group_id].add((row.previous_group_id, row.date_added)) |
| 83 | + id_to_related[row.previous_group_id].add((row.group_id, row.date_added)) |
43 | 84 |
|
44 | | - # We only want to set the threshold_breaker the first time that we cross |
45 | | - # the threshold. |
46 | | - if threshold_breaker_set is None and len(group_id_set) >= threshold: |
47 | | - # Because we're incrementing the size of group_id_set by either one or two |
48 | | - # each iteration, it's fine if we're a bit over. That's negligible compared |
49 | | - # to the scale-of-thousands Clickhouse threshold. |
50 | | - threshold_breaker_set = group_id_set.copy() |
|  85 | +        # Step 3: Cache the missed data (IDs with no redirects get an empty set) |
| 86 | + cache.set_many( |
| 87 | + data={ |
| 88 | + _build_group_redirect_by_group_id_cache_key(group_id): id_to_related[group_id] |
| 89 | + for group_id in uncached_group_ids |
| 90 | + }, |
| 91 | + timeout=300, # 5 minutes |
| 92 | + ) |
51 | 93 |
|
52 | | - out = group_id_set if threshold_breaker_set is None else threshold_breaker_set |
|  94 | +        # Step 4: If the result size exceeds the threshold, sort by date_added and |
|  95 | +        # return only the newest `threshold` results. |
| 96 | + output_set = {datum[0] for datum in running_data} |
| 97 | + span.set_data("true_group_id_len", len(output_set)) |
53 | 98 |
|
54 | | - span.set_data("true_group_id_len", len(group_id_set)) |
55 | | - span.set_data("returned_group_id_len", len(out)) |
| 99 | + if len(output_set) > threshold: |
| 100 | +            # Sort by date_added, newest first, and take the first `threshold` entries |
| 101 | + output_set = { |
| 102 | + datum[0] |
| 103 | + for datum in sorted(running_data, key=lambda datum: datum[1], reverse=True)[ |
| 104 | + :threshold |
| 105 | + ] |
| 106 | + } |
| 107 | + span.set_data("returned_group_id_len", len(output_set)) |
56 | 108 |
|
57 | | - return out |
| 109 | + return output_set |
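As a worked example of the step 4 cutoff (IDs, dates, and the threshold below are invented for illustration): the newest entries win, and the future timestamps seeded for the input IDs guarantee the inputs themselves are never dropped.

```python
# Invented data illustrating the threshold cutoff in step 4.
from datetime import UTC, datetime, timedelta

now = datetime.now(UTC)
running_data = {
    (101, now - timedelta(days=30)),    # oldest redirect: first to be dropped
    (102, now - timedelta(days=1)),
    (103, now + timedelta(minutes=1)),  # an input ID seeded with a future timestamp
}
threshold = 2
newest = {
    group_id
    for group_id, _ in sorted(running_data, key=lambda datum: datum[1], reverse=True)[:threshold]
}
assert newest == {102, 103}
```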