diff --git a/src/sentry/grouping/parameterization.py b/src/sentry/grouping/parameterization.py index a1be4c4a3376a7..87957ace6824cd 100644 --- a/src/sentry/grouping/parameterization.py +++ b/src/sentry/grouping/parameterization.py @@ -1,18 +1,42 @@ import dataclasses +import logging import re -from collections import OrderedDict, defaultdict +from collections import Counter, OrderedDict, defaultdict from collections.abc import Sequence from ipaddress import ip_address, ip_interface, ip_network -from typing import Callable +from typing import Any, Callable from sentry.utils import metrics +logger = logging.getLogger("sentry.events.grouping") + + +# Counter for logging a set amount of example data. Not meant to be used directly. (Use the +# `_log_example_data` helper instead.) +LOGGING_COUNTER: Counter[str] = Counter() + # Function parameterization regexes can specify to provide a customized replacement string. Can also # be used to do conditional replacement, by returning the original value in cases where replacement # shouldn't happen. ParameterizationReplacementFunction = Callable[[str], str] +# Log examples, up to the given limit. +def _log_example_data( + key: str, # Key used for tracking log count and in logger `event` string + extra: dict[str, Any], # Extra data to add to the log (should include example data) + limit: int = 100, # Number of logs to be gathered per deployment +) -> None: + # Note: In a multi-threaded environment, it's possible to run into a race condition where + # multiple threads are simultaneously logging what should theoretically be the last example. As + # result, we may end up logging a few more examples than the given limit. To fix this, we'd need + # to wrap everything here in a lock, but given that a few extra logs don't hurt anything, it's + # not worth blocking ingest by doing so. + if LOGGING_COUNTER[key] < limit: + logger.info(f"grouping.parameterization.{key}", extra=extra) + LOGGING_COUNTER[key] += 1 + + @dataclasses.dataclass class ParameterizationRegex: name: str # name of the pattern (also used as group name in combined regex) @@ -445,6 +469,10 @@ def _handle_regex_match(match: re.Match[str]) -> str: metrics.incr( "grouping.parameterization_false_positive", tags={"key": matched_key} ) + # TODO: Remove this once we have enough sample data + _log_example_data( + "ip_false_positive", extra={"input_str": input_str, "value": orig_value} + ) return replacement_string diff --git a/tests/sentry/grouping/test_parameterization.py b/tests/sentry/grouping/test_parameterization.py index 48cfb97bc31fc5..0bf373a68765f0 100644 --- a/tests/sentry/grouping/test_parameterization.py +++ b/tests/sentry/grouping/test_parameterization.py @@ -13,6 +13,7 @@ from sentry.grouping.parameterization import ( ParameterizationRegex, Parameterizer, + _log_example_data, experimental_parameterizer, is_valid_ip, parameterizer, @@ -717,3 +718,33 @@ def test_replacement_callback_false_positive_triggers_individual_regex_fallback( ) == 1 ) + + +@patch("sentry.grouping.parameterization.logger") +def test_example_data_logging(mock_logger: MagicMock) -> None: + for i in range(15): + _log_example_data("dog_fact_1", extra={"input_str": "dogs are great", "num": i}, limit=10) + + for i in range(105): + _log_example_data("dog_fact_2", extra={"input_str": "all dogs are good dogs", "num": i}) + + # In the first loop, we specified a limit of 10, so the logger was called 10 times, even though + # we called the helper 15 times + assert ( + count_matching_calls( + mock_logger.info, + "grouping.parameterization.dog_fact_1", + extra={"input_str": "dogs are great", "num": ANY}, + ) + == 10 + ) + # In the second loop, we didn't specify a limit, so the logger was called 100 times (the + # default limit), even though we called the helper 105 times + assert ( + count_matching_calls( + mock_logger.info, + "grouping.parameterization.dog_fact_2", + extra={"input_str": "all dogs are good dogs", "num": ANY}, + ) + == 100 + )