62 changes: 41 additions & 21 deletions src/sentry/ingest/transaction_clusterer/datasource/redis.py
@@ -1,12 +1,13 @@
"""Write transactions into redis sets"""

import logging
from collections.abc import Iterator, Mapping
from collections.abc import Callable, Iterator, Mapping
from typing import Any

import sentry_sdk
from django.conf import settings
from rediscluster import RedisCluster
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.ingest.transaction_clusterer import ClustererNamespace
from sentry.ingest.transaction_clusterer.datasource import (
@@ -16,6 +17,7 @@
)
from sentry.models.project import Project
from sentry.options.rollout import in_random_rollout
from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value
from sentry.utils import redis
from sentry.utils.safe import safe_execute

@@ -123,41 +125,59 @@ def record_transaction_name(project: Project, event_data: Mapping[str, Any], **k
safe_execute(_bump_rule_lifetime, project, event_data)


def record_segment_name(project: Project, segment_span: CompatibleSpan) -> None:
if segment_name := _should_store_segment_name(segment_span):
safe_execute(
_record_sample,
ClustererNamespace.TRANSACTIONS,
project,
segment_name,
)


def _should_store_transaction_name(event_data: Mapping[str, Any]) -> str | None:
"""Returns whether the given event must be stored as input for the
transaction clusterer."""
transaction_name = event_data.get("transaction")
if not transaction_name:
return None

tags = event_data.get("tags") or {}
transaction_info = event_data.get("transaction_info") or {}
source = transaction_info.get("source")

# We also feed transactions that have already been sanitized back into
# the clustering algorithm, so we have a chance to discover more
# high-cardinality segments after partial sanitization.
# For example, we may have sanitized `/orgs/*/projects/foo`,
# but the clusterer has yet to discover `/orgs/*/projects/*`.
#
# Disadvantage: the load on redis does not decrease over time.
#
def is_404() -> bool:
tags = event_data.get("tags") or []
return HTTP_404_TAG in tags

return _should_store_segment_name_inner(transaction_name, source, is_404)


def _should_store_segment_name(segment_span: CompatibleSpan) -> str | None:
segment_name = attribute_value(
segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME
) or segment_span.get("name")
source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)

def is_404() -> bool:
status_code = attribute_value(segment_span, ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE)
return status_code == 404

return _should_store_segment_name_inner(segment_name, source, is_404)


def _should_store_segment_name_inner(
name: str | None, source: str | None, is_404: Callable[[], bool]
) -> str | None:
if not name:
return None
source_matches = source in (TRANSACTION_SOURCE_URL, TRANSACTION_SOURCE_SANITIZED) or (
# Relay leaves source None if it expects it to be high cardinality (otherwise it sets it to "unknown")
# (see https://github.com/getsentry/relay/blob/2d07bef86415cc0ae8af01d16baecde10cdb23a6/relay-general/src/store/transactions/processor.rs#L369-L373).
#
# Our data shows that a majority of these `None` source transactions contain slashes, so treat them as URL transactions:
source is None
and "/" in transaction_name
and "/" in name
)

if not source_matches:
return None

if tags and HTTP_404_TAG in tags:
if is_404():
return None

return transaction_name
return name


def _bump_rule_lifetime(project: Project, event_data: Mapping[str, Any]) -> None:
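For illustration, a minimal sketch (not part of this diff) of how the shared decision helper introduced above behaves. Importing the private helper directly and the `no_404` callable are purely illustrative; the literal sources "url" and "route" mirror the values used elsewhere in the module.

from sentry.ingest.transaction_clusterer.datasource.redis import (
    _should_store_segment_name_inner,
)

def no_404() -> bool:
    # Pretend the segment did not respond with HTTP 404.
    return False

# URL-sourced (and sanitized) names are fed to the clusterer ...
assert _should_store_segment_name_inner("/orgs/123/projects/foo", "url", no_404) == "/orgs/123/projects/foo"
# ... names without a source count as URLs when they contain a slash ...
assert _should_store_segment_name_inner("/orgs/123/projects/foo", None, no_404) == "/orgs/123/projects/foo"
assert _should_store_segment_name_inner("foo", None, no_404) is None
# ... and low-cardinality sources such as "route" are skipped entirely.
assert _should_store_segment_name_inner("/", "route", no_404) is None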
3 changes: 2 additions & 1 deletion src/sentry/ingest/transaction_clusterer/normalization.py
@@ -4,6 +4,7 @@
import orjson
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED
from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value

# Ported from Relay:
@@ -106,7 +107,7 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str):
}
attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = {
"type": "string",
"value": "sanitized",
"value": TRANSACTION_SOURCE_SANITIZED,
}
attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = {
"type": "string",
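The literal replaced here matches the shared constant, so both modules now agree on Relay's transaction-source taxonomy. A quick sanity sketch of that assumption, with the values inferred from the literals this PR swaps out:

from sentry.ingest.transaction_clusterer.datasource import (
    TRANSACTION_SOURCE_SANITIZED,
    TRANSACTION_SOURCE_URL,
)

# Values inferred from the string literals replaced in this PR.
assert TRANSACTION_SOURCE_SANITIZED == "sanitized"
assert TRANSACTION_SOURCE_URL == "url"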
14 changes: 10 additions & 4 deletions src/sentry/spans/consumers/process_segments/message.py
@@ -13,6 +13,8 @@
from sentry.constants import DataCategory
from sentry.dynamic_sampling.rules.helpers.latest_releases import record_latest_release
from sentry.event_manager import INSIGHT_MODULE_TO_PROJECT_FLAG_NAME
from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_URL
from sentry.ingest.transaction_clusterer.datasource.redis import record_segment_name
from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name
from sentry.insights import FilterSpan
from sentry.insights import modules as insights_modules
@@ -64,7 +66,7 @@ def process_segment(
# If the project does not exist then it might have been deleted during ingestion.
return []

safe_execute(_normalize_segment_name, segment_span, project.organization)
safe_execute(_normalize_segment_name, segment_span, project)
_add_segment_name(segment_span, spans)
_compute_breakdowns(segment_span, spans, project)
_create_models(segment_span, project)
@@ -145,8 +147,10 @@ def _enrich_spans(


@metrics.wraps("spans.consumers.process_segments.normalize_segment_name")
def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organization) -> None:
if not features.has("organizations:normalize_segment_names_in_span_enrichment", organization):
def _normalize_segment_name(segment_span: CompatibleSpan, project: Project) -> None:
if not features.has(
"organizations:normalize_segment_names_in_span_enrichment", project.organization
):
return

segment_name = attribute_value(
@@ -157,10 +161,12 @@ def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organiza

source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
unknown_if_parameterized = not source
known_to_be_unparameterized = source == "url"
known_to_be_unparameterized = source == TRANSACTION_SOURCE_URL
if unknown_if_parameterized or known_to_be_unparameterized:
normalize_segment_name(segment_span)

record_segment_name(project, segment_span)


@metrics.wraps("spans.consumers.process_segments.add_segment_name")
def _add_segment_name(segment: CompatibleSpan, spans: Sequence[CompatibleSpan]) -> None:
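To illustrate the new flow end to end: with the feature flag enabled, span enrichment now both sanitizes the segment name and feeds the result into the clusterer. A rough sketch, assuming a minimal CompatibleSpan-shaped dict and ignoring the feature-flag plumbing; `project` is assumed to be an existing Project instance and is not defined here.

from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.ingest.transaction_clusterer.datasource.redis import record_segment_name
from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name

# `project` is assumed to be an existing sentry.models.project.Project instance.
segment_span = {
    "name": "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0",
    "attributes": {
        ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE: {"type": "string", "value": "url"},
    },
}

# Scrub identifiers from the URL-like name; as part of this step the span's
# source attribute is rewritten to TRANSACTION_SOURCE_SANITIZED.
normalize_segment_name(segment_span)  # type: ignore[arg-type]

# Feed the (possibly sanitized) segment name into the clusterer's Redis sets.
record_segment_name(project, segment_span)  # type: ignore[arg-type]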
43 changes: 43 additions & 0 deletions tests/sentry/ingest/test_transaction_clusterer.py
@@ -1,6 +1,7 @@
from unittest import mock

import pytest
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.ingest.transaction_clusterer import ClustererNamespace
from sentry.ingest.transaction_clusterer.base import ReplacementRule
@@ -13,6 +14,7 @@
get_active_projects,
get_redis_client,
get_transaction_names,
record_segment_name,
record_transaction_name,
)
from sentry.ingest.transaction_clusterer.meta import get_clusterer_meta
@@ -209,6 +211,47 @@ def test_record_transactions(
assert len(mocked_record.mock_calls) == expected


@mock.patch("sentry.ingest.transaction_clusterer.datasource.redis._record_sample")
@django_db_all
@pytest.mark.parametrize(
"source, segment_name, attributes, expected",
[
("url", "/a/b/c", {}, 1),
(
"url",
"/a/b/c",
{ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 200}},
1,
),
("route", "/", {}, 0),
("url", None, {}, 0),
(
"url",
"/a/b/c",
{ATTRIBUTE_NAMES.HTTP_RESPONSE_STATUS_CODE: {"type": "integer", "value": 404}},
0,
),
(None, "/a/b/c", {}, 1),
(None, "foo", {}, 0),
],
)
def test_record_segment_name(
mocked_record, default_organization, source, segment_name, attributes, expected
) -> None:
project = Project(id=111, name="project", organization_id=default_organization.id)
record_segment_name(
project,
{
"name": segment_name,
"attributes": {
ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE: {"type": "string", "value": source},
**attributes,
},
}, # type: ignore[typeddict-item]
)
assert len(mocked_record.mock_calls) == expected


def test_sort_rules() -> None:
rules = {
ReplacementRule("/a/*/**"): 1,
12 changes: 10 additions & 2 deletions tests/sentry/spans/consumers/process_segments/test_message.py
@@ -274,16 +274,23 @@ def test_segment_name_propagation_when_name_missing(self):
child_attributes = child_span["attributes"] or {}
assert child_attributes.get("sentry.segment.name") is None

def test_segment_name_normalization_with_feature(self):
@mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name")
def test_segment_name_normalization_with_feature(
self, mock_record_segment_name: mock.MagicMock
):
_, segment_span = self.generate_basic_spans()
segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"

with self.feature("organizations:normalize_segment_names_in_span_enrichment"):
processed_spans = process_segment([segment_span])

assert processed_spans[0]["name"] == "/foo/*/user/*/0"
mock_record_segment_name.assert_called_once()

def test_segment_name_normalization_without_feature(self):
@mock.patch("sentry.spans.consumers.process_segments.message.record_segment_name")
def test_segment_name_normalization_without_feature(
self, mock_record_segment_name: mock.MagicMock
):
_, segment_span = self.generate_basic_spans()
segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"

@@ -293,6 +300,7 @@ def test_segment_name_normalization_without_feature(self):
assert (
processed_spans[0]["name"] == "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
)
mock_record_segment_name.assert_not_called()

def test_segment_name_normalization_checks_source(self):
_, segment_span = self.generate_basic_spans()