From 840d83855127f90cc489fa4d2ac63723aa1777b5 Mon Sep 17 00:00:00 2001 From: Matt Quinn Date: Mon, 24 Nov 2025 14:29:13 -0500 Subject: [PATCH 1/3] feat(segment-enrichment): Record seen segment names Continues the work started in https://github.com/getsentry/sentry/pull/103739. As a first step to adding incremental segment name clustering to segment enrichment, record each seen segment name (like we do for transaction names in Sentry's `event_manager`). To work around the different signatures for transactions and segment spans, I extracted as much of the logic as I could into a second function, with one caller per scenario. As with the previous PR, the changes are behind an org flag (organizations:normalize_segment_names_in_span_enrichment) for testing and easy rollback. --- .../transaction_clusterer/datasource/redis.py | 62 ++++++++++++------- .../transaction_clusterer/normalization.py | 3 +- .../consumers/process_segments/message.py | 14 +++-- .../ingest/test_transaction_clusterer.py | 43 +++++++++++++ .../process_segments/test_message.py | 12 +++- 5 files changed, 106 insertions(+), 28 deletions(-) diff --git a/src/sentry/ingest/transaction_clusterer/datasource/redis.py b/src/sentry/ingest/transaction_clusterer/datasource/redis.py index 58006a41102afe..5df28222c91e40 100644 --- a/src/sentry/ingest/transaction_clusterer/datasource/redis.py +++ b/src/sentry/ingest/transaction_clusterer/datasource/redis.py @@ -1,12 +1,13 @@ """Write transactions into redis sets""" import logging -from collections.abc import Iterator, Mapping +from collections.abc import Callable, Iterator, Mapping from typing import Any import sentry_sdk from django.conf import settings from rediscluster import RedisCluster +from sentry_conventions.attributes import ATTRIBUTE_NAMES from sentry.ingest.transaction_clusterer import ClustererNamespace from sentry.ingest.transaction_clusterer.datasource import ( @@ -16,6 +17,7 @@ ) from sentry.models.project import Project from sentry.options.rollout import in_random_rollout +from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value from sentry.utils import redis from sentry.utils.safe import safe_execute @@ -123,41 +125,59 @@ def record_transaction_name(project: Project, event_data: Mapping[str, Any], **k safe_execute(_bump_rule_lifetime, project, event_data) +def record_segment_name(project: Project, segment_span: CompatibleSpan) -> None: + if segment_name := _should_store_segment_name(segment_span): + safe_execute( + _record_sample, + ClustererNamespace.TRANSACTIONS, + project, + segment_name, + ) + + def _should_store_transaction_name(event_data: Mapping[str, Any]) -> str | None: - """Returns whether the given event must be stored as input for the - transaction clusterer.""" transaction_name = event_data.get("transaction") - if not transaction_name: - return None - - tags = event_data.get("tags") or {} transaction_info = event_data.get("transaction_info") or {} source = transaction_info.get("source") - # We also feed back transactions into the clustering algorithm - # that have already been sanitized, so we have a chance to discover - # more high cardinality segments after partial sanitation. - # For example, we may have sanitized `/orgs/*/projects/foo`, - # But the clusterer has yet to discover `/orgs/*/projects/*`. - # - # Disadvantage: the load on redis does not decrease over time. - # + def is_404() -> bool: + tags = event_data.get("tags") or {} + return bool(tags and HTTP_404_TAG in tags) + + return _should_store_segment_name_inner(transaction_name, source, is_404) + + +def _should_store_segment_name(segment_span: CompatibleSpan) -> str | None: + segment_name = attribute_value( + segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME + ) or segment_span.get("name") + source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE) + + def is_404() -> bool: + status_code = attribute_value(segment_span, ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE) + return status_code == 404 + + return _should_store_segment_name_inner(segment_name, source, is_404) + + +def _should_store_segment_name_inner( + name: str | None, source: str | None, is_404: Callable[[], bool] +) -> str | None: + if not name: + return None source_matches = source in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED) or ( # Relay leaves source None if it expects it to be high cardinality, (otherwise it sets it to "unknown") # (see https://github.com/getsentry/relay/blob/2d07bef86415cc0ae8af01d16baecde10cdb23a6/relay-general/src/store/transactions/processor.rs#L369-L373). # # Our data shows that a majority of these `None` source transactions contain slashes, so treat them as URL transactions: source is None - and "/" in transaction_name + and "/" in name ) - if not source_matches: return None - - if tags and HTTP_404_TAG in tags: + if is_404(): return None - - return transaction_name + return name def _bump_rule_lifetime(project: Project, event_data: Mapping[str, Any]) -> None: diff --git a/src/sentry/ingest/transaction_clusterer/normalization.py b/src/sentry/ingest/transaction_clusterer/normalization.py index 07bba43f339964..0214f78506e00b 100644 --- a/src/sentry/ingest/transaction_clusterer/normalization.py +++ b/src/sentry/ingest/transaction_clusterer/normalization.py @@ -4,6 +4,7 @@ import orjson from sentry_conventions.attributes import ATTRIBUTE_NAMES +from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value # Ported from Relay: @@ -106,7 +107,7 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str): } attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = { "type": "string", - "value": "sanitized", + "value": TRANSACTION_SOURCE_SANITIZED, } attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = { "type": "string", diff --git a/src/sentry/spans/consumers/process_segments/message.py b/src/sentry/spans/consumers/process_segments/message.py index 1317224d42f5c6..43fa8757e23146 100644 --- a/src/sentry/spans/consumers/process_segments/message.py +++ b/src/sentry/spans/consumers/process_segments/message.py @@ -13,6 +13,8 @@ from sentry.constants import DataCategory from sentry.dynamic_sampling.rules.helpers.latest_releases import record_latest_release from sentry.event_manager import INSIGHT_MODULE_TO_PROJECT_FLAG_NAME +from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_URL +from sentry.ingest.transaction_clusterer.datasource.redis import record_segment_name from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name from sentry.insights import FilterSpan from sentry.insights import modules as insights_modules @@ -64,7 +66,7 @@ def process_segment( # If the project does not exist then it might have been deleted during ingestion. return [] - safe_execute(_normalize_segment_name, segment_span, project.organization) + safe_execute(_normalize_segment_name, segment_span, project) _add_segment_name(segment_span, spans) _compute_breakdowns(segment_span, spans, project) _create_models(segment_span, project) @@ -145,8 +147,10 @@ def _enrich_spans( @metrics.wraps("spans.consumers.process_segments.normalize_segment_name") -def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organization) -> None: - if not features.has("organizations:normalize_segment_names_in_span_enrichment", organization): +def _normalize_segment_name(segment_span: CompatibleSpan, project: Project) -> None: + if not features.has( + "organizations:normalize_segment_names_in_span_enrichment", project.organization + ): return segment_name = attribute_value( @@ -157,10 +161,12 @@ def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organiza source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE) unknown_if_parameterized = not source - known_to_be_unparameterized = source == "url" + known_to_be_unparameterized = source == TRANSACTION_SOURCE_URL if unknown_if_parameterized or known_to_be_unparameterized: normalize_segment_name(segment_span) + record_segment_name(project, segment_span) + @metrics.wraps("spans.consumers.process_segments.add_segment_name") def _add_segment_name(segment: CompatibleSpan, spans: Sequence[CompatibleSpan]) -> None: diff --git a/tests/sentry/ingest/test_transaction_clusterer.py b/tests/sentry/ingest/test_transaction_clusterer.py index 966625a11a2362..df26b28472ebb1 100644 --- a/tests/sentry/ingest/test_transaction_clusterer.py +++ b/tests/sentry/ingest/test_transaction_clusterer.py @@ -1,6 +1,7 @@ from unittest import mock import pytest +from sentry_conventions.attributes import ATTRIBUTE_NAMES from sentry.ingest.transaction_clusterer import ClustererNamespace from sentry.ingest.transaction_clusterer.base import ReplacementRule @@ -13,6 +14,7 @@ get_active_projects, get_redis_client, get_transaction_names, + record_segment_name, record_transaction_name, ) from sentry.ingest.transaction_clusterer.meta import get_clusterer_meta @@ -209,6 +211,47 @@ def test_record_transactions( assert len(mocked_record.mock_calls) == expected +@mock.patch("sentry.ingest.transaction_clusterer.datasource.redis._record_sample") +@django_db_all +@pytest.mark.parametrize( + "source, segment_name, attributes, expected", + [ + ("url", "/a/b/c", {}, 1), + ( + "url", + "/a/b/c", + {ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 200}}, + 1, + ), + ("route", "/", {}, 0), + ("url", None, {}, 0), + ( + "url", + "/a/b/c", + {ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 404}}, + 0, + ), + (None, "/a/b/c", {}, 1), + (None, "foo", {}, 0), + ], +) +def test_record_segment_name( + mocked_record, default_organization, source, segment_name, attributes, expected +) -> None: + project = Project(id=111, name="project", organization_id=default_organization.id) + record_segment_name( + project, + { + "name": segment_name, + "attributes": { + ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE: {"type": "string", "value": source}, + **attributes, + }, + }, # type: ignore[typeddict-item] + ) + assert len(mocked_record.mock_calls) == expected + + def test_sort_rules() -> None: rules = { ReplacementRule("/a/*/**"): 1, diff --git a/tests/sentry/spans/consumers/process_segments/test_message.py b/tests/sentry/spans/consumers/process_segments/test_message.py index f34c0593c07d6a..2166e03729a917 100644 --- a/tests/sentry/spans/consumers/process_segments/test_message.py +++ b/tests/sentry/spans/consumers/process_segments/test_message.py @@ -274,7 +274,10 @@ def test_segment_name_propagation_when_name_missing(self): child_attributes = child_span["attributes"] or {} assert child_attributes.get("sentry.segment.name") is None - def test_segment_name_normalization_with_feature(self): + @mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name") + def test_segment_name_normalization_with_feature( + self, mock_record_segment_name: mock.MagicMock + ): _, segment_span = self.generate_basic_spans() segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0" @@ -282,8 +285,12 @@ def test_segment_name_normalization_with_feature(self): processed_spans = process_segment([segment_span]) assert processed_spans[0]["name"] == "/foo/*/user/*/0" + mock_record_segment_name.assert_called_once() - def test_segment_name_normalization_without_feature(self): + @mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name") + def test_segment_name_normalization_without_feature( + self, mock_record_segment_name: mock.MagicMock + ): _, segment_span = self.generate_basic_spans() segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0" @@ -293,6 +300,7 @@ def test_segment_name_normalization_without_feature(self): assert ( processed_spans[0]["name"] == "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0" ) + mock_record_segment_name.assert_not_called() def test_segment_name_normalization_checks_source(self): _, segment_span = self.generate_basic_spans() From 8cf40afc2e05bdd037bb343dfc3e90c0a04d04c0 Mon Sep 17 00:00:00 2001 From: Matt Quinn Date: Tue, 25 Nov 2025 13:27:05 -0500 Subject: [PATCH 2/3] remove redundant tags check --- src/sentry/ingest/transaction_clusterer/datasource/redis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/ingest/transaction_clusterer/datasource/redis.py b/src/sentry/ingest/transaction_clusterer/datasource/redis.py index 5df28222c91e40..3c985b1ced7bfc 100644 --- a/src/sentry/ingest/transaction_clusterer/datasource/redis.py +++ b/src/sentry/ingest/transaction_clusterer/datasource/redis.py @@ -142,7 +142,7 @@ def _should_store_transaction_name(event_data: Mapping[str, Any]) -> str | None: def is_404() -> bool: tags = event_data.get("tags") or {} - return bool(tags and HTTP_404_TAG in tags) + return HTTP_404_TAG in tags return _should_store_segment_name_inner(transaction_name, source, is_404) From 45cbe7b315ebbcbe7ebd4a19bb47d3afbb2667d0 Mon Sep 17 00:00:00 2001 From: Matt Quinn Date: Tue, 25 Nov 2025 14:31:21 -0500 Subject: [PATCH 3/3] fix type of event tags --- src/sentry/ingest/transaction_clusterer/datasource/redis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentry/ingest/transaction_clusterer/datasource/redis.py b/src/sentry/ingest/transaction_clusterer/datasource/redis.py index 3c985b1ced7bfc..3f146f9a09001d 100644 --- a/src/sentry/ingest/transaction_clusterer/datasource/redis.py +++ b/src/sentry/ingest/transaction_clusterer/datasource/redis.py @@ -141,7 +141,7 @@ def _should_store_transaction_name(event_data: Mapping[str, Any]) -> str | None: source = transaction_info.get("source") def is_404() -> bool: - tags = event_data.get("tags") or {} + tags = event_data.get("tags") or [] return HTTP_404_TAG in tags return _should_store_segment_name_inner(transaction_name, source, is_404)