fix(compositor): address correctness + bench issues from review

streamkit-devin · streamer45 · streamkit-devin · commit 8311d56208d5 · 2026-03-01T09:15:55.000Z
- Fix #1 (High): the canvas-clear skip now validates source pixel alpha (all pixels must have alpha==255) before skipping the clear. This prevents blending against stale pooled buffer data when an RGBA source has transparency.
- Fix #2 (Medium): conversion cache slot indices now use the position in the full layers slice (with None holes) via two-pass resolution, so cache keys stay stable when slots gain/lose frames.
- Fix #3 (Medium): the benchmark now calls the real composite_frame() kernel instead of reimplementing compositing inline. It exercises all kernel optimizations (cache, clear-skip, identity fast-path, x-map).
- Fix from Devin Review: revert video pool preallocation (it was allocating ~121MB across all bucket sizes at startup). Restored lazy allocation.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs
@@ -325,17 +325,9 @@ pub const DEFAULT_VIDEO_BUCKET_SIZES: &[usize] = &[
 ];
 pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16;
 
-/// Number of buffers to preallocate per video bucket at startup.
-/// Avoids cold-start misses for the first few frames.
-pub const DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET: usize = 2;
-
 impl FramePool<u8> {
     pub fn video_default() -> Self {
-        Self::preallocated_with_max(
-            DEFAULT_VIDEO_BUCKET_SIZES,
-            DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET,
-            DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET,
-        )
+        Self::with_buckets(DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET)
     }
 }
 
diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs
@@ -41,21 +41,10 @@ use streamkit_core::types::PixelFormat;
 
 // Re-use the compositor kernel and pixel_ops directly.
 use streamkit_nodes::video::compositor::config::Rect;
+use streamkit_nodes::video::compositor::kernel::{composite_frame, ConversionCache, LayerSnapshot};
+use streamkit_nodes::video::compositor::overlay::DecodedOverlay;
 use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420;
 
-/// Inline copy of `LayerSnapshot` to avoid depending on the private `kernel` module.
-/// Must stay in sync with `kernel::LayerSnapshot`.
-struct LayerSnapshot {
-    data: Arc<PooledVideoData>,
-    width: u32,
-    height: u32,
-    pixel_format: PixelFormat,
-    rect: Option<Rect>,
-    opacity: f32,
-    z_index: i32,
-    rotation_degrees: f32,
-}
-
 // ── Default benchmark parameters ────────────────────────────────────────────
 
 const DEFAULT_WIDTH: u32 = 1280;
@@ -178,78 +167,32 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> {
 
 // ── Compositing harness ─────────────────────────────────────────────────────
 
-/// Directly call the compositing kernel for `frame_count` iterations,
-/// returning per-frame timing statistics.
+/// Call the real `composite_frame` kernel for `frame_count` iterations,
+/// returning per-frame timing statistics.  This exercises all kernel
+/// optimizations: conversion cache, skip-canvas-clear, identity-scale
+/// fast-path, precomputed x-map, etc.
 fn bench_composite(
     _label: &str,
     canvas_w: u32,
     canvas_h: u32,
     layers: &[Option<LayerSnapshot>],
     frame_count: u32,
 ) -> BenchResult {
-    // Re-create the kernel's compositing logic inline since `composite_frame`
-    // is pub(crate). We call the public pixel_ops functions directly.
-    let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4;
-    let mut canvas = vec![0u8; total_bytes];
-    let mut i420_scratch: Vec<u8> = Vec::new();
+    let empty_overlays: Vec<Arc<DecodedOverlay>> = Vec::new();
+    let mut conversion_cache = ConversionCache::new();
 
     let start = Instant::now();
 
     for _ in 0..frame_count {
-        // Zero the canvas.
-        canvas.fill(0);
-
-        // Blit each layer.
-        for layer in layers.iter().flatten() {
-            let dst_rect = layer.rect.clone().unwrap_or(Rect {
-                x: 0,
-                y: 0,
-                width: canvas_w,
-                height: canvas_h,
-            });
-
-            let src_data: &[u8] = match layer.pixel_format {
-                PixelFormat::Rgba8 => layer.data.as_slice(),
-                PixelFormat::I420 => {
-                    let needed = layer.width as usize * layer.height as usize * 4;
-                    if i420_scratch.len() < needed {
-                        i420_scratch.resize(needed, 0);
-                    }
-                    streamkit_nodes::video::compositor::pixel_ops::i420_to_rgba8_buf(
-                        layer.data.as_slice(),
-                        layer.width,
-                        layer.height,
-                        &mut i420_scratch,
-                    );
-                    &i420_scratch[..needed]
-                },
-                PixelFormat::Nv12 => {
-                    let needed = layer.width as usize * layer.height as usize * 4;
-                    if i420_scratch.len() < needed {
-                        i420_scratch.resize(needed, 0);
-                    }
-                    streamkit_nodes::video::compositor::pixel_ops::nv12_to_rgba8_buf(
-                        layer.data.as_slice(),
-                        layer.width,
-                        layer.height,
-                        &mut i420_scratch,
-                    );
-                    &i420_scratch[..needed]
-                },
-            };
-
-            streamkit_nodes::video::compositor::pixel_ops::scale_blit_rgba_rotated(
-                &mut canvas,
-                canvas_w,
-                canvas_h,
-                src_data,
-                layer.width,
-                layer.height,
-                &dst_rect,
-                layer.opacity,
-                layer.rotation_degrees,
-            );
-        }
+        let _result = composite_frame(
+            canvas_w,
+            canvas_h,
+            layers,
+            &empty_overlays,
+            &empty_overlays,
+            None,
+            &mut conversion_cache,
+        );
     }
 
     let elapsed = start.elapsed();
diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs
@@ -46,6 +46,23 @@ impl ConversionCache {
         Self { entries: Vec::new() }
     }
 
+    /// Return the previously-cached RGBA slice for the layer at `slot_idx`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `slot_idx` is out of range for `entries`, if that slot is
+    /// empty, or if its buffer is shorter than `width * height * 4` bytes.
+    /// Only called in the second pass of `composite_frame`, after the first
+    /// pass has run `get_or_convert` for every non-RGBA layer.
+    fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
+        #[allow(clippy::expect_used)] // an empty slot here is an invariant violation
+        let cached = self.entries[slot_idx]
+            .as_ref()
+            .expect("get_cached called before get_or_convert");
+        let needed = layer.width as usize * layer.height as usize * 4; // 4 bytes per pixel
+        &cached.rgba[..needed]
+    }
+
     /// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`.
     /// Returns a slice of RGBA8 data.
     fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
@@ -101,12 +118,20 @@ impl ConversionCache {
     }
 }
 
-/// Returns `true` if the first visible layer is fully opaque, unrotated, and
-/// covers the entire canvas — meaning the canvas clear can be skipped.
+/// Returns `true` if the first visible layer is fully opaque (both layer
+/// opacity *and* source pixel alpha), unrotated, and covers the entire
+/// canvas — meaning the canvas clear can be skipped.
+///
+/// `first_src_data` is the RGBA8 source buffer for the first layer (after
+/// any YUV→RGBA conversion).  We check that every pixel in that buffer has
+/// `alpha == 255`; if any pixel is semi-transparent the clear cannot be
+/// skipped because the blit would blend with uninitialised (or stale
+/// pooled) canvas bytes.  Passing `None` always makes this return `false`.
 fn first_layer_covers_canvas(
     layers: &[Option<LayerSnapshot>],
     canvas_w: u32,
     canvas_h: u32,
+    first_src_data: Option<&[u8]>,
 ) -> bool {
     let Some(first) = layers.iter().flatten().next() else {
         return false;
@@ -118,12 +143,23 @@ fn first_layer_covers_canvas(
 
     // Check if the layer fully covers the canvas.
     // A layer with no rect fills the entire canvas by default.
-    first.rect.as_ref().map_or(true, |r| {
+    let covers = first.rect.as_ref().map_or(true, |r| {
         r.x <= 0
             && r.y <= 0
             && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
             && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
-    })
+    });
+    if !covers {
+        return false;
+    }
+
+    // Verify that *all* source pixels are fully opaque (alpha == 255).
+    // Without this check, semi-transparent source pixels would blend with
+    // uninitialised canvas bytes when the clear is skipped.
+    let Some(src) = first_src_data else {
+        return false;
+    };
+    src.chunks_exact(4).all(|px| px[3] == 255)
 }
 
 /// Snapshot of one input layer's data for the blocking compositor thread.
@@ -185,26 +221,53 @@ pub fn composite_frame(
 
     let buf = pooled.as_mut_slice();
 
-    // Skip the canvas clear when the first layer is opaque, unrotated, and
-    // covers the entire canvas — the blit will fully overwrite every pixel.
-    if !first_layer_covers_canvas(layers, canvas_w, canvas_h) {
+    // Two-pass source resolution.
+    //
+    // Pass 1: populate the conversion cache for every non-RGBA layer.
+    // `slot_idx` uses the position in the `layers` slice (which preserves
+    // `None` holes) so that cache indices stay stable even when some slots
+    // have no frame.
+    for (slot_idx, entry) in layers.iter().enumerate() {
+        if let Some(layer) = entry {
+            if layer.pixel_format != PixelFormat::Rgba8 {
+                conversion_cache.get_or_convert(slot_idx, layer);
+            }
+        }
+    }
+
+    // Pass 2: build resolved references.  The mutable borrow of
+    // `conversion_cache` from pass 1 is released, so we can now take
+    // shared references into the cache alongside references into `layers`.
+    let resolved: Vec<Option<(&LayerSnapshot, &[u8])>> = layers
+        .iter()
+        .enumerate()
+        .map(|(slot_idx, entry)| {
+            entry.as_ref().map(|layer| {
+                let src_data: &[u8] = match layer.pixel_format {
+                    PixelFormat::Rgba8 => layer.data.as_slice(),
+                    PixelFormat::I420 | PixelFormat::Nv12 => {
+                        // Cache was populated in pass 1; this is a shared
+                        // read that cannot fail.
+                        conversion_cache.get_cached(slot_idx, layer)
+                    },
+                };
+                (layer, src_data)
+            })
+        })
+        .collect();
+
+    // Now that we have the first layer's resolved RGBA data, check whether
+    // the canvas clear can be skipped.
+    let first_src = resolved.iter().flatten().next().map(|(_, d)| *d);
+    if !first_layer_covers_canvas(layers, canvas_w, canvas_h, first_src) {
         buf[..total_bytes].fill(0);
     }
 
     // Blit each layer (in order — first layer is bottom, last is top).
-    // Non-RGBA layers use the conversion cache to avoid redundant per-frame
-    // YUV→RGBA8 conversion when the source data hasn't changed.
-    for (slot_idx, layer) in layers.iter().flatten().enumerate() {
+    for (layer, src_data) in resolved.iter().flatten() {
         let dst_rect =
             layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h });
 
-        let src_data: &[u8] = match layer.pixel_format {
-            PixelFormat::Rgba8 => layer.data.as_slice(),
-            PixelFormat::I420 | PixelFormat::Nv12 => {
-                conversion_cache.get_or_convert(slot_idx, layer)
-            },
-        };
-
         scale_blit_rgba_rotated(
             buf,
             canvas_w,
diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs
@@ -25,8 +25,8 @@
 //! - Bilinear / Lanczos scaling (MVP uses nearest-neighbor).
 
 pub mod config;
-mod kernel;
-mod overlay;
+pub mod kernel;
+pub mod overlay;
 pub mod pixel_ops;
 
 use async_trait::async_trait;