Skip to content

Commit f64843b

Browse files
authored
feat(spans): enforce max segment bytes during ingestion (#112561)
Refs STREAM-859 We want to be able to skip merging further subsegments into the set, but we don't want to drop data. So naturally, a subsegment not being merged into its set has to become its own separate segment. - Adds the `spans.buffer.enforce-segment-size` option to force segment size limit of `max-segment-bytes`. - When the option is enabled: - The Lua script checks cumulative ingested bytes of the target set to decide whether to merge or not. - Writes each subsegment's payloads to a separate key determined by a random salt. This is to prevent dropping overflow subsegments when we decide not to merge them into the target set. Each subsegment not being merged will become its own separate segment, and we add a new key in the queue based on the salt value above.
1 parent 884ade1 commit f64843b

File tree

4 files changed

+264
-15
lines changed

4 files changed

+264
-15
lines changed

src/sentry/options/defaults.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3198,6 +3198,13 @@
31983198
default=False,
31993199
flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
32003200
)
3201+
# Whether to enforce max-segment-bytes during ingestion via the Lua script.
3202+
register(
3203+
"spans.buffer.enforce-segment-size",
3204+
type=Bool,
3205+
default=False,
3206+
flags=FLAG_PRIORITIZE_DISK | FLAG_AUTOMATOR_MODIFIABLE,
3207+
)
32013208
# TTL for keys in Redis. This is a downside protection in case of bugs.
32023209
register(
32033210
"spans.buffer.redis-ttl",

src/sentry/scripts/spans/add-buffer.lua

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ ARGS:
2727
- has_root_span -- "true" or "false" -- Whether the subsegment contains the root of the segment.
2828
- set_timeout -- int
2929
- byte_count -- int -- The total number of bytes in the subsegment.
30+
- max_segment_bytes -- int -- Maximum allowed ingested bytes for a segment. 0 means no limit.
31+
- salt -- str -- Unique identifier for this subsegment. When the segment exceeds max_segment_bytes, this subsegment
32+
is detached into its own segment keyed by salt. Empty string disables this behavior.
3033
- *span_id -- str[] -- The span ids in the subsegment.
3134
3235
RETURNS:
@@ -45,7 +48,9 @@ local parent_span_id = ARGV[2]
4548
local has_root_span = ARGV[3] == "true"
4649
local set_timeout = tonumber(ARGV[4])
4750
local byte_count = tonumber(ARGV[5])
48-
local NUM_ARGS = 5
51+
local max_segment_bytes = tonumber(ARGV[6])
52+
local salt = ARGV[7] or ""
53+
local NUM_ARGS = 7
4954

5055
local function get_time_ms()
5156
local time = redis.call("TIME")
@@ -100,6 +105,82 @@ redis.call("expire", main_redirect_key, set_timeout)
100105
local redirect_end_time_ms = get_time_ms()
101106
table.insert(latency_table, {"redirect_step_latency_ms", redirect_end_time_ms - start_time_ms})
102107

108+
if salt ~= "" then
109+
local ingested_byte_count_key = string.format("span-buf:ibc:%s", set_key)
110+
local ingested_byte_count = tonumber(redis.call("get", ingested_byte_count_key) or 0)
111+
112+
for i = NUM_ARGS + 1, NUM_ARGS + num_spans do
113+
local span_id = ARGV[i]
114+
if span_id ~= parent_span_id then
115+
local child_set_key = string.format("span-buf:s:{%s}:%s", project_and_trace, span_id)
116+
local child_ibc_key = string.format("span-buf:ibc:%s", child_set_key)
117+
local child_ibc = tonumber(redis.call("get", child_ibc_key) or 0)
118+
byte_count = byte_count + child_ibc
119+
end
120+
end
121+
122+
-- If the segment is already too big, make this subsegment its own segment
123+
-- with salt as the identifier.
124+
if max_segment_bytes > 0 and tonumber(ingested_byte_count) + byte_count > max_segment_bytes then
125+
set_span_id = salt
126+
set_key = string.format("span-buf:s:{%s}:%s", project_and_trace, salt)
127+
ingested_byte_count_key = string.format("span-buf:ibc:%s", set_key)
128+
end
129+
130+
local ingested_count_key = string.format("span-buf:ic:%s", set_key)
131+
local members_key = string.format("span-buf:mk:{%s}:%s", project_and_trace, set_span_id)
132+
133+
for i = NUM_ARGS + 1, NUM_ARGS + num_spans do
134+
local span_id = ARGV[i]
135+
if span_id ~= parent_span_id then
136+
local child_set_key = string.format("span-buf:s:{%s}:%s", project_and_trace, span_id)
137+
138+
local child_ic_key = string.format("span-buf:ic:%s", child_set_key)
139+
local child_ic = redis.call("get", child_ic_key)
140+
if child_ic then
141+
redis.call("incrby", ingested_count_key, child_ic)
142+
redis.call("del", child_ic_key)
143+
end
144+
145+
local child_ibc_key = string.format("span-buf:ibc:%s", child_set_key)
146+
local child_ibc = redis.call("get", child_ibc_key)
147+
if child_ibc then
148+
-- byte_count already holds the child's byte count, so we don't need to add again
149+
redis.call("del", child_ibc_key)
150+
end
151+
152+
local child_members_key = string.format("span-buf:mk:{%s}:%s", project_and_trace, span_id)
153+
local child_members = redis.call("smembers", child_members_key)
154+
if #child_members > 0 then
155+
redis.call("sadd", members_key, unpack(child_members))
156+
redis.call("del", child_members_key)
157+
end
158+
end
159+
end
160+
161+
local merge_payload_keys_end_time_ms = get_time_ms()
162+
table.insert(latency_table, {"merge_payload_keys_step_latency_ms", merge_payload_keys_end_time_ms - redirect_end_time_ms})
163+
164+
redis.call("sadd", members_key, salt)
165+
redis.call("expire", members_key, set_timeout)
166+
167+
-- Track total number of spans ingested for this segment
168+
redis.call("incrby", ingested_count_key, num_spans)
169+
redis.call("incrby", ingested_byte_count_key, byte_count)
170+
redis.call("expire", ingested_count_key, set_timeout)
171+
redis.call("expire", ingested_byte_count_key, set_timeout)
172+
173+
local counter_merge_end_time_ms = get_time_ms()
174+
table.insert(latency_table, {"counter_merge_step_latency_ms", counter_merge_end_time_ms - merge_payload_keys_end_time_ms})
175+
176+
-- Capture end time and calculate latency in milliseconds
177+
local end_time_ms = get_time_ms()
178+
local latency_ms = end_time_ms - start_time_ms
179+
table.insert(latency_table, {"total_step_latency_ms", latency_ms})
180+
181+
return {set_key, has_root_span, latency_ms, latency_table, metrics_table}
182+
end
183+
103184
-- Maintain member-keys (span-buf:mk) tracking sets so the flusher
104185
-- knows which payload keys to fetch.
105186
local member_keys_key = string.format("span-buf:mk:{%s}:%s", project_and_trace, set_span_id)

src/sentry/spans/buffer.py

Lines changed: 57 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,26 @@
3232
3333
Now how does that look like in Redis? For each incoming span, we:
3434
35-
1. Store the span payload in a payload key:
36-
"span-buf:s:{project_id:trace_id:span_id}:span_id". Each subsegment
37-
gets its own key, distributed across Redis cluster nodes.
35+
1. Store the span payload in a payload key. Each subsegment gets its own key,
36+
distributed across Redis cluster nodes.
37+
a. When segment size enforcement is disabled, the key uses the parent_span_id to
38+
determine where to write span payloads to.
39+
Key: `span-buf:s:{project_id:trace_id:parent_span_id}:parent_span_id`
40+
b. When segment size enforcement is enabled, the key uses a unique salt per
41+
subsegment. This allows us to skip merging the subsegment into the parent segment
42+
and not lose any data, since the subsegment will become its own separate segment
43+
and be flushed out independently.
44+
Key: `span-buf:s:{project_id:trace_id:salt}:salt`
3845
2. The Lua script (add-buffer.lua) receives the span IDs and:
3946
a. Follows redirects from parent_span_id (hashmap at
4047
"span-buf:ssr:{project_id:trace_id}") to find the segment root.
4148
b. Updates the redirect table so future spans can find the segment root.
4249
c. Merges member-keys indexes and counters (ingested count, byte count)
4350
from span IDs that were previously separate segment roots into the
4451
current segment root.
52+
d. If segment size enforcement is enabled and the segment exceeds
53+
max_segment_bytes, detaches the subsegment into its own segment
54+
keyed by the salt.
4555
3. To a "global queue", we write the segment key, sorted by timeout.
4656
4757
Eventually, flushing cronjob looks at that global queue, and removes all timed
@@ -58,6 +68,22 @@
5868
or using spillover topics, especially when their new partition count is lower
5969
than the original topic.
6070
71+
Segment size enforcement:
72+
73+
Segments can grow unboundedly as spans arrive. To prevent oversized segments from
74+
consuming excessive memory during flush, the buffer enforces a maximum byte limit
75+
per segment (controlled by `spans.buffer.max-segment-bytes` and gated behind
76+
`spans.buffer.enforce-segment-size`).
77+
78+
Each subsegment is assigned a unique salt (UUID). The Lua script tracks cumulative
79+
ingested bytes per segment via `span-buf:ibc` keys. If adding a subsegment would
80+
push the segment over the byte limit, the script detaches it into a new segment
81+
keyed by the salt instead of merging it into the parent. The detached segment is
82+
independently tracked and flushed.
83+
84+
During flush, segments that exceed `max-segment-bytes` are chunked into multiple
85+
Kafka messages to stay within downstream size limits.
86+
6187
Glossary for types of keys:
6288
6389
* span-buf:s:{project_id:trace_id:span_id}:span_id -- payload keys containing span payloads, distributed across cluster nodes.
@@ -76,6 +102,7 @@
76102
import logging
77103
import math
78104
import time
105+
import uuid
79106
from collections.abc import Generator, MutableMapping, Sequence
80107
from typing import Any, NamedTuple, cast
81108

@@ -146,6 +173,12 @@ def effective_parent_id(self):
146173
type SpanPayload = dict[str, Any]
147174

148175

176+
class Subsegment(NamedTuple):
177+
project_and_trace: tuple[str, str]
178+
salt: str
179+
subsegment: list[Span]
180+
181+
149182
class OutputSpan(NamedTuple):
150183
payload: SpanPayload
151184

@@ -254,6 +287,8 @@ def process_spans(self, spans: Sequence[Span], now: int):
254287
timeout = options.get("spans.buffer.timeout")
255288
root_timeout = options.get("spans.buffer.root-timeout")
256289
max_spans_per_evalsha = options.get("spans.buffer.max-spans-per-evalsha")
290+
max_segment_bytes = options.get("spans.buffer.max-segment-bytes")
291+
enforce_segment_size = options.get("spans.buffer.enforce-segment-size")
257292
result_meta = []
258293
is_root_span_count = 0
259294

@@ -263,25 +298,27 @@ def process_spans(self, spans: Sequence[Span], now: int):
263298

264299
# Split large subsegments into chunks to avoid Lua unpack() limits.
265300
# Chunks share the same parent_span_id but are processed separately.
266-
tree_items: list[tuple[tuple[str, str], list[Span]]] = []
301+
tree_items: list[Subsegment] = []
267302
for key, subsegment in trees.items():
268303
if max_spans_per_evalsha > 0 and len(subsegment) > max_spans_per_evalsha:
269304
for chunk in itertools.batched(subsegment, max_spans_per_evalsha):
270-
tree_items.append((key, list(chunk)))
305+
tree_items.append(Subsegment(key, uuid.uuid4().hex, list(chunk)))
271306
else:
272-
tree_items.append((key, subsegment))
307+
tree_items.append(Subsegment(key, uuid.uuid4().hex, subsegment))
273308

274-
tree_batches: Sequence[Sequence[tuple[tuple[str, str], list[Span]]]]
309+
tree_batches: Sequence[Sequence[Subsegment]]
275310
if pipeline_batch_size > 0:
276311
tree_batches = list(itertools.batched(tree_items, pipeline_batch_size))
277312
else:
278313
tree_batches = [tree_items]
279314

280315
for batch in tree_batches:
281316
with self.client.pipeline(transaction=False) as p:
282-
for (project_and_trace, parent_span_id), subsegment in batch:
317+
for (project_and_trace, parent_span_id), salt, subsegment in batch:
283318
set_members = self._prepare_payloads(subsegment)
284319
payload_key = self._get_payload_key(project_and_trace, parent_span_id)
320+
if enforce_segment_size:
321+
payload_key = self._get_payload_key(project_and_trace, salt)
285322
p.sadd(payload_key, *set_members)
286323
p.expire(payload_key, redis_ttl)
287324

@@ -296,7 +333,7 @@ def process_spans(self, spans: Sequence[Span], now: int):
296333
results: list[Any] = []
297334
for batch in tree_batches:
298335
with self.client.pipeline(transaction=False) as p:
299-
for (project_and_trace, parent_span_id), subsegment in batch:
336+
for (project_and_trace, parent_span_id), salt, subsegment in batch:
300337
byte_count = sum(len(span.payload) for span in subsegment)
301338

302339
try:
@@ -323,6 +360,8 @@ def process_spans(self, spans: Sequence[Span], now: int):
323360
is_segment_span,
324361
redis_ttl,
325362
byte_count,
363+
max_segment_bytes,
364+
salt if enforce_segment_size else "",
326365
*span_ids,
327366
)
328367

@@ -331,7 +370,7 @@ def process_spans(self, spans: Sequence[Span], now: int):
331370
# All spans in a subsegment share the same trace_id,
332371
# so they all came from the same Kafka partition.
333372
partition = subsegment[0].partition
334-
result_meta.append((project_and_trace, parent_span_id, partition))
373+
result_meta.append((project_and_trace, parent_span_id, partition, salt))
335374

336375
results.extend(p.execute())
337376

@@ -349,7 +388,9 @@ def process_spans(self, spans: Sequence[Span], now: int):
349388

350389
assert len(result_meta) == len(results)
351390

352-
for (project_and_trace, parent_span_id, partition), result in zip(result_meta, results):
391+
for (project_and_trace, parent_span_id, partition, salt), result in zip(
392+
result_meta, results
393+
):
353394
(
354395
segment_key,
355396
has_root_span,
@@ -402,9 +443,11 @@ def process_spans(self, spans: Sequence[Span], now: int):
402443

403444
subsegment_spans = trees[project_and_trace, parent_span_id]
404445
delete_set = queue_deletes.setdefault(queue_key, set())
405-
delete_set.update(
406-
self._get_span_key(project_and_trace, span.span_id) for span in subsegment_spans
407-
)
446+
if not segment_key.endswith(salt.encode("ascii")):
447+
delete_set.update(
448+
self._get_span_key(project_and_trace, span.span_id)
449+
for span in subsegment_spans
450+
)
408451
delete_set.discard(segment_key)
409452

410453
for result in results:

0 commit comments

Comments
 (0)