Skip to content

Commit 8311d56

Browse files
fix(compositor): address correctness + bench issues from review
- Fix #1 (High): skip-clear now validates source pixel alpha (all pixels must have alpha==255) before skipping canvas clear. Prevents blending against stale pooled buffer data when RGBA source has transparency. - Fix #2 (Medium): conversion cache slot indices now use position in the full layers slice (with None holes) via two-pass resolution, so cache keys stay stable when slots gain/lose frames. - Fix #3 (Medium): benchmark now calls real composite_frame() kernel instead of reimplementing compositing inline. Exercises all kernel optimizations (cache, clear-skip, identity fast-path, x-map). - Fix Devin Review: revert video pool preallocation (was allocating ~121MB across all bucket sizes at startup). Restored lazy allocation. Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
1 parent e942408 commit 8311d56

File tree

4 files changed

+100
-102
lines changed

4 files changed

+100
-102
lines changed

crates/core/src/frame_pool.rs

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -325,17 +325,9 @@ pub const DEFAULT_VIDEO_BUCKET_SIZES: &[usize] = &[
325325
];
326326
pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16;
327327

328-
/// Number of buffers to preallocate per video bucket at startup.
329-
/// Avoids cold-start misses for the first few frames.
330-
pub const DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET: usize = 2;
331-
332328
impl FramePool<u8> {
333329
pub fn video_default() -> Self {
334-
Self::preallocated_with_max(
335-
DEFAULT_VIDEO_BUCKET_SIZES,
336-
DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET,
337-
DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET,
338-
)
330+
Self::with_buckets(DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET)
339331
}
340332
}
341333

crates/engine/benches/compositor_only.rs

Lines changed: 17 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -41,21 +41,10 @@ use streamkit_core::types::PixelFormat;
4141

4242
// Re-use the compositor kernel and pixel_ops directly.
4343
use streamkit_nodes::video::compositor::config::Rect;
44+
use streamkit_nodes::video::compositor::kernel::{composite_frame, ConversionCache, LayerSnapshot};
45+
use streamkit_nodes::video::compositor::overlay::DecodedOverlay;
4446
use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420;
4547

46-
/// Inline copy of `LayerSnapshot` to avoid depending on the private `kernel` module.
47-
/// Must stay in sync with `kernel::LayerSnapshot`.
48-
struct LayerSnapshot {
49-
data: Arc<PooledVideoData>,
50-
width: u32,
51-
height: u32,
52-
pixel_format: PixelFormat,
53-
rect: Option<Rect>,
54-
opacity: f32,
55-
z_index: i32,
56-
rotation_degrees: f32,
57-
}
58-
5948
// ── Default benchmark parameters ────────────────────────────────────────────
6049

6150
const DEFAULT_WIDTH: u32 = 1280;
@@ -178,78 +167,32 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> {
178167

179168
// ── Compositing harness ─────────────────────────────────────────────────────
180169

181-
/// Directly call the compositing kernel for `frame_count` iterations,
182-
/// returning per-frame timing statistics.
170+
/// Call the real `composite_frame` kernel for `frame_count` iterations,
171+
/// returning per-frame timing statistics. This exercises all kernel
172+
/// optimizations: conversion cache, skip-canvas-clear, identity-scale
173+
/// fast-path, precomputed x-map, etc.
183174
fn bench_composite(
184175
_label: &str,
185176
canvas_w: u32,
186177
canvas_h: u32,
187178
layers: &[Option<LayerSnapshot>],
188179
frame_count: u32,
189180
) -> BenchResult {
190-
// Re-create the kernel's compositing logic inline since `composite_frame`
191-
// is pub(crate). We call the public pixel_ops functions directly.
192-
let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4;
193-
let mut canvas = vec![0u8; total_bytes];
194-
let mut i420_scratch: Vec<u8> = Vec::new();
181+
let empty_overlays: Vec<Arc<DecodedOverlay>> = Vec::new();
182+
let mut conversion_cache = ConversionCache::new();
195183

196184
let start = Instant::now();
197185

198186
for _ in 0..frame_count {
199-
// Zero the canvas.
200-
canvas.fill(0);
201-
202-
// Blit each layer.
203-
for layer in layers.iter().flatten() {
204-
let dst_rect = layer.rect.clone().unwrap_or(Rect {
205-
x: 0,
206-
y: 0,
207-
width: canvas_w,
208-
height: canvas_h,
209-
});
210-
211-
let src_data: &[u8] = match layer.pixel_format {
212-
PixelFormat::Rgba8 => layer.data.as_slice(),
213-
PixelFormat::I420 => {
214-
let needed = layer.width as usize * layer.height as usize * 4;
215-
if i420_scratch.len() < needed {
216-
i420_scratch.resize(needed, 0);
217-
}
218-
streamkit_nodes::video::compositor::pixel_ops::i420_to_rgba8_buf(
219-
layer.data.as_slice(),
220-
layer.width,
221-
layer.height,
222-
&mut i420_scratch,
223-
);
224-
&i420_scratch[..needed]
225-
},
226-
PixelFormat::Nv12 => {
227-
let needed = layer.width as usize * layer.height as usize * 4;
228-
if i420_scratch.len() < needed {
229-
i420_scratch.resize(needed, 0);
230-
}
231-
streamkit_nodes::video::compositor::pixel_ops::nv12_to_rgba8_buf(
232-
layer.data.as_slice(),
233-
layer.width,
234-
layer.height,
235-
&mut i420_scratch,
236-
);
237-
&i420_scratch[..needed]
238-
},
239-
};
240-
241-
streamkit_nodes::video::compositor::pixel_ops::scale_blit_rgba_rotated(
242-
&mut canvas,
243-
canvas_w,
244-
canvas_h,
245-
src_data,
246-
layer.width,
247-
layer.height,
248-
&dst_rect,
249-
layer.opacity,
250-
layer.rotation_degrees,
251-
);
252-
}
187+
let _result = composite_frame(
188+
canvas_w,
189+
canvas_h,
190+
layers,
191+
&empty_overlays,
192+
&empty_overlays,
193+
None,
194+
&mut conversion_cache,
195+
);
253196
}
254197

255198
let elapsed = start.elapsed();

crates/nodes/src/video/compositor/kernel.rs

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,23 @@ impl ConversionCache {
4646
Self { entries: Vec::new() }
4747
}
4848

49+
/// Return a previously-cached RGBA slice for `slot_idx`.
50+
///
51+
/// # Panics
52+
///
53+
/// Panics if the slot has not been populated by a prior `get_or_convert`
54+
/// call for the same `layer`. This is only called in the second pass of
55+
/// `composite_frame` after the first pass has ensured every non-RGBA
56+
/// layer has been converted.
57+
fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
58+
#[allow(clippy::expect_used)]
59+
let cached = self.entries[slot_idx]
60+
.as_ref()
61+
.expect("get_cached called before get_or_convert");
62+
let needed = layer.width as usize * layer.height as usize * 4;
63+
&cached.rgba[..needed]
64+
}
65+
4966
/// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`.
5067
/// Returns a slice of RGBA8 data.
5168
fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
@@ -101,12 +118,20 @@ impl ConversionCache {
101118
}
102119
}
103120

104-
/// Returns `true` if the first visible layer is fully opaque, unrotated, and
105-
/// covers the entire canvas — meaning the canvas clear can be skipped.
121+
/// Returns `true` if the first visible layer is fully opaque (both layer
122+
/// opacity *and* source pixel alpha), unrotated, and covers the entire
123+
/// canvas — meaning the canvas clear can be skipped.
124+
///
125+
/// `first_src_data` is the RGBA8 source buffer for the first layer (after
126+
/// any YUV→RGBA conversion). We check that every pixel in the region that
127+
/// will be blitted has `alpha == 255`; if any pixel is semi-transparent the
128+
/// clear cannot be skipped because the blit would blend with uninitialised
129+
/// (or stale pooled) canvas bytes.
106130
fn first_layer_covers_canvas(
107131
layers: &[Option<LayerSnapshot>],
108132
canvas_w: u32,
109133
canvas_h: u32,
134+
first_src_data: Option<&[u8]>,
110135
) -> bool {
111136
let Some(first) = layers.iter().flatten().next() else {
112137
return false;
@@ -118,12 +143,23 @@ fn first_layer_covers_canvas(
118143

119144
// Check if the layer fully covers the canvas.
120145
// A layer with no rect fills the entire canvas by default.
121-
first.rect.as_ref().map_or(true, |r| {
146+
let covers = first.rect.as_ref().map_or(true, |r| {
122147
r.x <= 0
123148
&& r.y <= 0
124149
&& i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
125150
&& i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
126-
})
151+
});
152+
if !covers {
153+
return false;
154+
}
155+
156+
// Verify that *all* source pixels are fully opaque (alpha == 255).
157+
// Without this check, semi-transparent source pixels would blend with
158+
// uninitialised canvas bytes when the clear is skipped.
159+
let Some(src) = first_src_data else {
160+
return false;
161+
};
162+
src.chunks_exact(4).all(|px| px[3] == 255)
127163
}
128164

129165
/// Snapshot of one input layer's data for the blocking compositor thread.
@@ -185,26 +221,53 @@ pub fn composite_frame(
185221

186222
let buf = pooled.as_mut_slice();
187223

188-
// Skip the canvas clear when the first layer is opaque, unrotated, and
189-
// covers the entire canvas — the blit will fully overwrite every pixel.
190-
if !first_layer_covers_canvas(layers, canvas_w, canvas_h) {
224+
// Two-pass source resolution.
225+
//
226+
// Pass 1: populate the conversion cache for every non-RGBA layer.
227+
// `slot_idx` uses the position in the `layers` slice (which preserves
228+
// `None` holes) so that cache indices stay stable even when some slots
229+
// have no frame.
230+
for (slot_idx, entry) in layers.iter().enumerate() {
231+
if let Some(layer) = entry {
232+
if layer.pixel_format != PixelFormat::Rgba8 {
233+
conversion_cache.get_or_convert(slot_idx, layer);
234+
}
235+
}
236+
}
237+
238+
// Pass 2: build resolved references. The mutable borrow of
239+
// `conversion_cache` from pass 1 is released, so we can now take
240+
// shared references into the cache alongside references into `layers`.
241+
let resolved: Vec<Option<(&LayerSnapshot, &[u8])>> = layers
242+
.iter()
243+
.enumerate()
244+
.map(|(slot_idx, entry)| {
245+
entry.as_ref().map(|layer| {
246+
let src_data: &[u8] = match layer.pixel_format {
247+
PixelFormat::Rgba8 => layer.data.as_slice(),
248+
PixelFormat::I420 | PixelFormat::Nv12 => {
249+
// Cache was populated in pass 1; this is a shared
250+
// read that cannot fail.
251+
conversion_cache.get_cached(slot_idx, layer)
252+
},
253+
};
254+
(layer, src_data)
255+
})
256+
})
257+
.collect();
258+
259+
// Now that we have the first layer's resolved RGBA data, check whether
260+
// the canvas clear can be skipped.
261+
let first_src = resolved.iter().flatten().next().map(|(_, d)| *d);
262+
if !first_layer_covers_canvas(layers, canvas_w, canvas_h, first_src) {
191263
buf[..total_bytes].fill(0);
192264
}
193265

194266
// Blit each layer (in order — first layer is bottom, last is top).
195-
// Non-RGBA layers use the conversion cache to avoid redundant per-frame
196-
// YUV→RGBA8 conversion when the source data hasn't changed.
197-
for (slot_idx, layer) in layers.iter().flatten().enumerate() {
267+
for (layer, src_data) in resolved.iter().flatten() {
198268
let dst_rect =
199269
layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h });
200270

201-
let src_data: &[u8] = match layer.pixel_format {
202-
PixelFormat::Rgba8 => layer.data.as_slice(),
203-
PixelFormat::I420 | PixelFormat::Nv12 => {
204-
conversion_cache.get_or_convert(slot_idx, layer)
205-
},
206-
};
207-
208271
scale_blit_rgba_rotated(
209272
buf,
210273
canvas_w,

crates/nodes/src/video/compositor/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
//! - Bilinear / Lanczos scaling (MVP uses nearest-neighbor).
2626
2727
pub mod config;
28-
mod kernel;
29-
mod overlay;
28+
pub mod kernel;
29+
pub mod overlay;
3030
pub mod pixel_ops;
3131

3232
use async_trait::async_trait;

0 commit comments

Comments
 (0)