Skip to content

Commit ea48886

Browse files
lobsterkatiegeorge-sentry
authored and committed
ref(grouping): Log example false positive ip data (#112487)
This adds a new parameterization helper, `_log_example_data`, which logs the given example data at most a given number of times per deployment. (The default limit is 100, but a different limit can be passed in.) The helper is then used to log examples of our IP false positives.
1 parent 09abcc3 commit ea48886

File tree

2 files changed

+61
-2
lines changed

2 files changed

+61
-2
lines changed

src/sentry/grouping/parameterization.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,42 @@
11
import dataclasses
2+
import logging
23
import re
3-
from collections import OrderedDict, defaultdict
4+
from collections import Counter, OrderedDict, defaultdict
45
from collections.abc import Sequence
56
from ipaddress import ip_address, ip_interface, ip_network
6-
from typing import Callable
7+
from typing import Any, Callable
78

89
from sentry.utils import metrics
910

11+
logger = logging.getLogger("sentry.events.grouping")
12+
13+
14+
# Counter for logging a set amount of example data. Not meant to be used directly. (Use the
15+
# `_log_example_data` helper instead.)
16+
LOGGING_COUNTER: Counter[str] = Counter()
17+
1018
# Function parameterization regexes can specify to provide a customized replacement string. Can also
1119
# be used to do conditional replacement, by returning the original value in cases where replacement
1220
# shouldn't happen.
1321
ParameterizationReplacementFunction = Callable[[str], str]
1422

1523

24+
# Log examples, up to the given limit.
25+
def _log_example_data(
    key: str,  # Identifies the kind of example; used for counting and in the log `event` string
    extra: dict[str, Any],  # Payload attached to the log entry (should carry the example data)
    limit: int = 100,  # Maximum number of entries to log for `key`
) -> None:
    # Once `limit` entries have been recorded for this key in `LOGGING_COUNTER`, do nothing.
    #
    # Note: The check below and the increment at the end are not atomic. In a multi-threaded
    # environment, several threads can all pass the check while the count is one short of the
    # limit, and as a result we may end up logging a few more examples than requested. Fixing
    # that would mean wrapping everything here in a lock, but a few extra logs don't hurt
    # anything, so it's not worth blocking ingest to prevent them.
    if LOGGING_COUNTER[key] >= limit:
        return

    logger.info(f"grouping.parameterization.{key}", extra=extra)
    LOGGING_COUNTER[key] += 1
38+
39+
1640
@dataclasses.dataclass
1741
class ParameterizationRegex:
1842
name: str # name of the pattern (also used as group name in combined regex)
@@ -445,6 +469,10 @@ def _handle_regex_match(match: re.Match[str]) -> str:
445469
metrics.incr(
446470
"grouping.parameterization_false_positive", tags={"key": matched_key}
447471
)
472+
# TODO: Remove this once we have enough sample data
473+
_log_example_data(
474+
"ip_false_positive", extra={"input_str": input_str, "value": orig_value}
475+
)
448476

449477
return replacement_string
450478

tests/sentry/grouping/test_parameterization.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from sentry.grouping.parameterization import (
1414
ParameterizationRegex,
1515
Parameterizer,
16+
_log_example_data,
1617
experimental_parameterizer,
1718
is_valid_ip,
1819
parameterizer,
@@ -717,3 +718,33 @@ def test_replacement_callback_false_positive_triggers_individual_regex_fallback(
717718
)
718719
== 1
719720
)
721+
722+
723+
@patch("sentry.grouping.parameterization.logger")
def test_example_data_logging(mock_logger: MagicMock) -> None:
    # `_log_example_data` keeps its per-key counts in the module-level `LOGGING_COUNTER`, which
    # lives for the whole process. Run against an emptied counter (and restore the previous
    # contents afterwards) so that this test gives the same result when it's re-run in the same
    # process and doesn't leak its counts into other tests.
    with patch.dict("sentry.grouping.parameterization.LOGGING_COUNTER", clear=True):
        for i in range(15):
            _log_example_data(
                "dog_fact_1", extra={"input_str": "dogs are great", "num": i}, limit=10
            )

        for i in range(105):
            _log_example_data(
                "dog_fact_2", extra={"input_str": "all dogs are good dogs", "num": i}
            )

    # In the first loop, we specified a limit of 10, so the logger was called 10 times, even though
    # we called the helper 15 times
    assert (
        count_matching_calls(
            mock_logger.info,
            "grouping.parameterization.dog_fact_1",
            extra={"input_str": "dogs are great", "num": ANY},
        )
        == 10
    )
    # In the second loop, we didn't specify a limit, so the logger was called 100 times (the
    # default limit), even though we called the helper 105 times
    assert (
        count_matching_calls(
            mock_logger.info,
            "grouping.parameterization.dog_fact_2",
            extra={"input_str": "all dogs are good dogs", "num": ANY},
        )
        == 100
    )

0 commit comments

Comments
 (0)