From 250e6e8971102a839d0d4bd91e77bf6ca90fa8ed Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:30:51 +0000
Subject: [PATCH 01/12] perf(compositor): add compositor-only microbenchmark

Adds a standalone benchmark that measures composite_frame() in isolation
(no VP9 encode, no mux, no async runtime overhead).

Scenarios:
- 1/2/4 layers RGBA
- Mixed I420+RGBA and NV12+RGBA (measures conversion overhead)
- Rotation (measures rotated blit path)
- Static layers (same Arc each frame, for future cache-hit measurement)

Runs at 640x480, 1280x720, 1920x1080 by default.

Baseline results on this VM (8 logical CPUs):
  1920x1080 1-layer-rgba:     ~728 fps (1.37 ms/frame)
  1920x1080 2-layer-rgba-pip: ~601 fps (1.66 ms/frame)
  1920x1080 2-layer-i420+rgba: ~427 fps (2.34 ms/frame)
  1920x1080 2-layer-nv12+rgba: ~478 fps (2.09 ms/frame)
  1920x1080 2-layer-rgba-rotated: ~470 fps (2.13 ms/frame)

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/engine/Cargo.toml                 |   4 +
 crates/engine/benches/compositor_only.rs | 649 +++++++++++++++++++++++
 2 files changed, 653 insertions(+)
 create mode 100644 crates/engine/benches/compositor_only.rs
diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml
index a5cf11e4..8410f58d 100644
--- a/crates/engine/Cargo.toml
+++ b/crates/engine/Cargo.toml
@@ -64,5 +64,9 @@ indexmap = { workspace = true }
 name = "compositor_pipeline"
 harness = false
 
+[[bench]]
+name = "compositor_only"
+harness = false
+
 [lints]
 workspace = true
diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs
new file mode 100644
index 00000000..0b66c961
--- /dev/null
+++ b/crates/engine/benches/compositor_only.rs
@@ -0,0 +1,649 @@
+// SPDX-FileCopyrightText: © 2025 StreamKit Contributors
+//
+// SPDX-License-Identifier: MPL-2.0
+
+#![allow(clippy::disallowed_macros)] // Bench binary intentionally uses eprintln!/println! for output.
+#![allow(clippy::expect_used)] // Panicking on errors is fine in a benchmark binary.
+#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::cast_precision_loss)]
+
+//! Compositor-only microbenchmark — measures `composite_frame` in isolation
+//! (no VP9 encode, no mux, no async runtime overhead).
+//!
+//! Exercises the following scenarios across multiple resolutions:
+//!
+//! - 1 layer RGBA (baseline)
+//! - 2 layers RGBA (PiP)
+//! - 4 layers RGBA
+//! - 2 layers mixed I420 + RGBA (measures YUV→RGBA conversion overhead)
+//! - 2 layers mixed NV12 + RGBA
+//! - 2 layers RGBA with rotation
+//! - 2 layers RGBA, static (same data each frame — for future cache-hit measurement)
+//!
+//! ## Usage
+//!
+//! Quick run (default 200 frames @ 1280×720):
+//!
+//! ```bash
+//! cargo bench -p streamkit-engine --bench compositor_only
+//! ```
+//!
+//! Custom parameters:
+//!
+//! ```bash
+//! cargo bench -p streamkit-engine --bench compositor_only -- --frames 500 --width 1920 --height 1080
+//! ```
+
+use std::sync::Arc;
+use std::time::Instant;
+
+use streamkit_core::frame_pool::PooledVideoData;
+use streamkit_core::types::PixelFormat;
+
+// Re-use the compositor kernel and pixel_ops directly.
+use streamkit_nodes::video::compositor::config::Rect;
+use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420;
+
+/// Inline copy of `LayerSnapshot` to avoid depending on the private `kernel` module.
+/// Must stay in sync with `kernel::LayerSnapshot`.
+struct LayerSnapshot {
+    data: Arc<PooledVideoData>,
+    width: u32,
+    height: u32,
+    pixel_format: PixelFormat,
+    rect: Option<Rect>,
+    opacity: f32,
+    z_index: i32,
+    rotation_degrees: f32,
+}
+
+// ── Default benchmark parameters ────────────────────────────────────────────
+
+const DEFAULT_WIDTH: u32 = 1280;
+const DEFAULT_HEIGHT: u32 = 720;
+const DEFAULT_FRAME_COUNT: u32 = 200;
+
+// ── Arg parser ──────────────────────────────────────────────────────────────
+
+struct BenchArgs {
+    width: u32,
+    height: u32,
+    frame_count: u32,
+    iterations: u32,
+    /// Optional filter: only run scenarios whose label contains this substring.
+    filter: Option<String>,
+}
+
+impl BenchArgs {
+    fn parse() -> Self {
+        let args: Vec<String> = std::env::args().collect();
+        let mut cfg = Self {
+            width: DEFAULT_WIDTH,
+            height: DEFAULT_HEIGHT,
+            frame_count: DEFAULT_FRAME_COUNT,
+            iterations: 3,
+            filter: None,
+        };
+        let mut i = 1;
+        while i < args.len() {
+            match args[i].as_str() {
+                "--width" | "-w" => {
+                    i += 1;
+                    if let Some(v) = args.get(i) {
+                        cfg.width = v.parse().unwrap_or(cfg.width);
+                    }
+                },
+                "--height" | "-h" => {
+                    i += 1;
+                    if let Some(v) = args.get(i) {
+                        cfg.height = v.parse().unwrap_or(cfg.height);
+                    }
+                },
+                "--frames" | "-n" => {
+                    i += 1;
+                    if let Some(v) = args.get(i) {
+                        cfg.frame_count = v.parse().unwrap_or(cfg.frame_count);
+                    }
+                },
+                "--iterations" | "-i" => {
+                    i += 1;
+                    if let Some(v) = args.get(i) {
+                        cfg.iterations = v.parse().unwrap_or(cfg.iterations);
+                    }
+                },
+                "--filter" | "-f" => {
+                    i += 1;
+                    if let Some(v) = args.get(i) {
+                        cfg.filter = Some(v.clone());
+                    }
+                },
+                _ => {},
+            }
+            i += 1;
+        }
+        cfg
+    }
+}
+
+// ── Frame generators ────────────────────────────────────────────────────────
+
+/// Generate an RGBA8 color-bar frame (opaque, all alpha = 255).
+fn generate_rgba_frame(width: u32, height: u32) -> Vec<u8> {
+    let w = width as usize;
+    let h = height as usize;
+    let mut data = vec![0u8; w * h * 4];
+    // Simple vertical gradient bars for visual distinctness.
+    let bar_colors: &[(u8, u8, u8)] = &[
+        (191, 191, 191), // white
+        (191, 191, 0),   // yellow
+        (0, 191, 191),   // cyan
+        (0, 191, 0),     // green
+        (191, 0, 191),   // magenta
+        (191, 0, 0),     // red
+        (0, 0, 191),     // blue
+    ];
+    for row in 0..h {
+        for col in 0..w {
+            let bar_idx = col * bar_colors.len() / w;
+            let (r, g, b) = bar_colors[bar_idx];
+            let off = (row * w + col) * 4;
+            data[off] = r;
+            data[off + 1] = g;
+            data[off + 2] = b;
+            data[off + 3] = 255;
+        }
+    }
+    data
+}
+
+/// Generate an I420 frame by converting an RGBA frame.
+fn generate_i420_frame(width: u32, height: u32) -> Vec<u8> {
+    let rgba = generate_rgba_frame(width, height);
+    rgba8_to_i420(&rgba, width, height)
+}
+
+/// Generate an NV12 frame by converting an RGBA frame.
+fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> {
+    let rgba = generate_rgba_frame(width, height);
+    let w = width as usize;
+    let h = height as usize;
+    let chroma_w = w.div_ceil(2);
+    let chroma_h = h.div_ceil(2);
+    let nv12_size = w * h + chroma_w * 2 * chroma_h;
+    let mut nv12 = vec![0u8; nv12_size];
+    streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf(&rgba, width, height, &mut nv12);
+    nv12
+}
+
+// ── Compositing harness ─────────────────────────────────────────────────────
+
+/// Directly call the compositing kernel for `frame_count` iterations,
+/// returning per-frame timing statistics.
+fn bench_composite(
+    _label: &str,
+    canvas_w: u32,
+    canvas_h: u32,
+    layers: &[Option<LayerSnapshot>],
+    frame_count: u32,
+) -> BenchResult {
+    // Re-create the kernel's compositing logic inline since `composite_frame`
+    // is pub(crate). We call the public pixel_ops functions directly.
+    let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4;
+    let mut canvas = vec![0u8; total_bytes];
+    let mut i420_scratch: Vec<u8> = Vec::new();
+
+    let start = Instant::now();
+
+    for _ in 0..frame_count {
+        // Zero the canvas.
+        canvas.fill(0);
+
+        // Blit each layer.
+        for layer in layers.iter().flatten() {
+            let dst_rect = layer.rect.clone().unwrap_or(Rect {
+                x: 0,
+                y: 0,
+                width: canvas_w,
+                height: canvas_h,
+            });
+
+            let src_data: &[u8] = match layer.pixel_format {
+                PixelFormat::Rgba8 => layer.data.as_slice(),
+                PixelFormat::I420 => {
+                    let needed = layer.width as usize * layer.height as usize * 4;
+                    if i420_scratch.len() < needed {
+                        i420_scratch.resize(needed, 0);
+                    }
+                    streamkit_nodes::video::compositor::pixel_ops::i420_to_rgba8_buf(
+                        layer.data.as_slice(),
+                        layer.width,
+                        layer.height,
+                        &mut i420_scratch,
+                    );
+                    &i420_scratch[..needed]
+                },
+                PixelFormat::Nv12 => {
+                    let needed = layer.width as usize * layer.height as usize * 4;
+                    if i420_scratch.len() < needed {
+                        i420_scratch.resize(needed, 0);
+                    }
+                    streamkit_nodes::video::compositor::pixel_ops::nv12_to_rgba8_buf(
+                        layer.data.as_slice(),
+                        layer.width,
+                        layer.height,
+                        &mut i420_scratch,
+                    );
+                    &i420_scratch[..needed]
+                },
+            };
+
+            streamkit_nodes::video::compositor::pixel_ops::scale_blit_rgba_rotated(
+                &mut canvas,
+                canvas_w,
+                canvas_h,
+                src_data,
+                layer.width,
+                layer.height,
+                &dst_rect,
+                layer.opacity,
+                layer.rotation_degrees,
+            );
+        }
+    }
+
+    let elapsed = start.elapsed();
+    BenchResult {
+        total_secs: elapsed.as_secs_f64(),
+        frame_count,
+    }
+}
+
+struct BenchResult {
+    total_secs: f64,
+    frame_count: u32,
+}
+
+impl BenchResult {
+    fn fps(&self) -> f64 {
+        f64::from(self.frame_count) / self.total_secs
+    }
+
+    fn ms_per_frame(&self) -> f64 {
+        self.total_secs * 1000.0 / f64::from(self.frame_count)
+    }
+}
+
+// ── Scenario definitions ────────────────────────────────────────────────────
+
+struct Scenario {
+    label: String,
+    layers: Vec<Option<LayerSnapshot>>,
+}
+
+fn make_layer(
+    data: Vec<u8>,
+    width: u32,
+    height: u32,
+    pixel_format: PixelFormat,
+    rect: Option<Rect>,
+    opacity: f32,
+    z_index: i32,
+    rotation_degrees: f32,
+) -> Option<LayerSnapshot> {
+    Some(LayerSnapshot {
+        data: Arc::new(PooledVideoData::from_vec(data)),
+        width,
+        height,
+        pixel_format,
+        rect,
+        opacity,
+        z_index,
+        rotation_degrees,
+    })
+}
+
+fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
+    let pip_w = canvas_w / 3;
+    let pip_h = canvas_h / 3;
+    let pip_x = (canvas_w - pip_w - 20) as i32;
+    let pip_y = (canvas_h - pip_h - 20) as i32;
+
+    vec![
+        // 1 layer RGBA — baseline
+        Scenario {
+            label: "1-layer-rgba".to_string(),
+            layers: vec![make_layer(
+                generate_rgba_frame(canvas_w, canvas_h),
+                canvas_w,
+                canvas_h,
+                PixelFormat::Rgba8,
+                None,
+                1.0,
+                0,
+                0.0,
+            )],
+        },
+        // 2 layers RGBA (PiP)
+        Scenario {
+            label: "2-layer-rgba-pip".to_string(),
+            layers: vec![
+                make_layer(
+                    generate_rgba_frame(canvas_w, canvas_h),
+                    canvas_w,
+                    canvas_h,
+                    PixelFormat::Rgba8,
+                    None,
+                    1.0,
+                    0,
+                    0.0,
+                ),
+                make_layer(
+                    generate_rgba_frame(pip_w, pip_h),
+                    pip_w,
+                    pip_h,
+                    PixelFormat::Rgba8,
+                    Some(Rect {
+                        x: pip_x,
+                        y: pip_y,
+                        width: pip_w,
+                        height: pip_h,
+                    }),
+                    0.9,
+                    1,
+                    0.0,
+                ),
+            ],
+        },
+        // 4 layers RGBA
+        Scenario {
+            label: "4-layer-rgba".to_string(),
+            layers: vec![
+                make_layer(
+                    generate_rgba_frame(canvas_w, canvas_h),
+                    canvas_w,
+                    canvas_h,
+                    PixelFormat::Rgba8,
+                    None,
+                    1.0,
+                    0,
+                    0.0,
+                ),
+                make_layer(
+                    generate_rgba_frame(pip_w, pip_h),
+                    pip_w,
+                    pip_h,
+                    PixelFormat::Rgba8,
+                    Some(Rect {
+                        x: pip_x,
+                        y: pip_y,
+                        width: pip_w,
+                        height: pip_h,
+                    }),
+                    0.9,
+                    1,
+                    0.0,
+                ),
+                make_layer(
+                    generate_rgba_frame(pip_w, pip_h),
+                    pip_w,
+                    pip_h,
+                    PixelFormat::Rgba8,
+                    Some(Rect {
+                        x: 20,
+                        y: 20,
+                        width: pip_w,
+                        height: pip_h,
+                    }),
+                    0.8,
+                    2,
+                    0.0,
+                ),
+                make_layer(
+                    generate_rgba_frame(pip_w, pip_h),
+                    pip_w,
+                    pip_h,
+                    PixelFormat::Rgba8,
+                    Some(Rect {
+                        x: 20,
+                        y: pip_y,
+                        width: pip_w,
+                        height: pip_h,
+                    }),
+                    0.7,
+                    3,
+                    0.0,
+                ),
+            ],
+        },
+        // 2 layers: I420 bg + RGBA PiP (measures conversion overhead)
+        Scenario {
+            label: "2-layer-i420+rgba".to_string(),
+            layers: vec![
+                make_layer(
+                    generate_i420_frame(canvas_w, canvas_h),
+                    canvas_w,
+                    canvas_h,
+                    PixelFormat::I420,
+                    None,
+                    1.0,
+                    0,
+                    0.0,
+                ),
+                make_layer(
+                    generate_rgba_frame(pip_w, pip_h),
+                    pip_w,
+                    pip_h,
+                    PixelFormat::Rgba8,
+                    Some(Rect {
+                        x: pip_x,
+                        y: pip_y,
+                        width: pip_w,
+                        height: pip_h,
+                    }),
+                    0.9,
+                    1,
+                    0.0,
+                ),
+            ],
+        },
+        // 2 layers: NV12 bg + RGBA PiP
+        Scenario {
+            label: "2-layer-nv12+rgba".to_string(),
+            layers: vec![
+                make_layer(
+                    generate_nv12_frame(canvas_w, canvas_h),
+                    canvas_w,
+                    canvas_h,
+                    PixelFormat::Nv12,
+                    None,
+                    1.0,
+                    0,
+                    0.0,
+                ),
+                make_layer(
+                    generate_rgba_frame(pip_w, pip_h),
+                    pip_w,
+                    pip_h,
+                    PixelFormat::Rgba8,
+                    Some(Rect {
+                        x: pip_x,
+                        y: pip_y,
+                        width: pip_w,
+                        height: pip_h,
+                    }),
+                    0.9,
+                    1,
+                    0.0,
+                ),
+            ],
+        },
+        // 2 layers RGBA with rotation on PiP
+        Scenario {
+            label: "2-layer-rgba-rotated".to_string(),
+            layers: vec![
+                make_layer(
+                    generate_rgba_frame(canvas_w, canvas_h),
+                    canvas_w,
+                    canvas_h,
+                    PixelFormat::Rgba8,
+                    None,
+                    1.0,
+                    0,
+                    0.0,
+                ),
+                make_layer(
+                    generate_rgba_frame(pip_w, pip_h),
+                    pip_w,
+                    pip_h,
+                    PixelFormat::Rgba8,
+                    Some(Rect {
+                        x: pip_x,
+                        y: pip_y,
+                        width: pip_w,
+                        height: pip_h,
+                    }),
+                    0.9,
+                    1,
+                    15.0, // 15° rotation
+                ),
+            ],
+        },
+        // 2 layers RGBA, static (same Arc — for future cache-hit measurement)
+        Scenario {
+            label: "2-layer-rgba-static".to_string(),
+            layers: {
+                let bg = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(
+                    canvas_w, canvas_h,
+                )));
+                let pip = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(pip_w, pip_h)));
+                vec![
+                    Some(LayerSnapshot {
+                        data: bg,
+                        width: canvas_w,
+                        height: canvas_h,
+                        pixel_format: PixelFormat::Rgba8,
+                        rect: None,
+                        opacity: 1.0,
+                        z_index: 0,
+                        rotation_degrees: 0.0,
+                    }),
+                    Some(LayerSnapshot {
+                        data: pip,
+                        width: pip_w,
+                        height: pip_h,
+                        pixel_format: PixelFormat::Rgba8,
+                        rect: Some(Rect {
+                            x: pip_x,
+                            y: pip_y,
+                            width: pip_w,
+                            height: pip_h,
+                        }),
+                        opacity: 0.9,
+                        z_index: 1,
+                        rotation_degrees: 0.0,
+                    }),
+                ]
+            },
+        },
+    ]
+}
+
+// ── Main ────────────────────────────────────────────────────────────────────
+
+fn main() {
+    let args = BenchArgs::parse();
+
+    let resolutions: &[(u32, u32)] = if args.width == DEFAULT_WIDTH && args.height == DEFAULT_HEIGHT
+    {
+        // Default: run at multiple resolutions.
+        &[(640, 480), (1280, 720), (1920, 1080)]
+    } else {
+        // Custom: run at the specified resolution only.
+        // (Leak to get 'static — acceptable in a short-lived bench binary.)
+        let res = Box::leak(Box::new([(args.width, args.height)]));
+        res
+    };
+
+    eprintln!("╔══════════════════════════════════════════════════════════╗");
+    eprintln!("║         Compositor-Only Microbenchmark                  ║");
+    eprintln!("╠══════════════════════════════════════════════════════════╣");
+    eprintln!(
+        "║  Resolutions : {:<41}║",
+        resolutions
+            .iter()
+            .map(|(w, h)| format!("{w}×{h}"))
+            .collect::<Vec<_>>()
+            .join(", ")
+    );
+    eprintln!("║  Frames      : {:<41}║", args.frame_count);
+    eprintln!("║  Iterations  : {:<41}║", args.iterations);
+    if let Some(ref f) = args.filter {
+        eprintln!("║  Filter      : {f:<41}║");
+    }
+    eprintln!("╚══════════════════════════════════════════════════════════╝");
+    eprintln!();
+
+    let mut json_results: Vec<serde_json::Value> = Vec::new();
+
+    for &(w, h) in resolutions {
+        eprintln!("── {w}×{h} ──────────────────────────────────────────────");
+
+        let scenarios = build_scenarios(w, h);
+
+        for scenario in &scenarios {
+            if let Some(ref filter) = args.filter {
+                if !scenario.label.contains(filter.as_str()) {
+                    continue;
+                }
+            }
+
+            let mut iter_results = Vec::with_capacity(args.iterations as usize);
+
+            for iter in 1..=args.iterations {
+                let result =
+                    bench_composite(&scenario.label, w, h, &scenario.layers, args.frame_count);
+                eprintln!(
+                    "  {:<28} iter {iter}/{}: {:>8.1} fps  ({:.2} ms/frame)",
+                    scenario.label,
+                    args.iterations,
+                    result.fps(),
+                    result.ms_per_frame(),
+                );
+                iter_results.push(result);
+            }
+
+            // Summary for this scenario.
+            let fps_values: Vec<f64> = iter_results.iter().map(BenchResult::fps).collect();
+            let ms_values: Vec<f64> = iter_results.iter().map(BenchResult::ms_per_frame).collect();
+            let mean_fps = fps_values.iter().sum::<f64>() / fps_values.len() as f64;
+            let mean_ms = ms_values.iter().sum::<f64>() / ms_values.len() as f64;
+            let min_ms = ms_values.iter().copied().fold(f64::INFINITY, f64::min);
+            let max_ms = ms_values.iter().copied().fold(f64::NEG_INFINITY, f64::max);
+
+            eprintln!(
+                "  {:<28} avg: {:>8.1} fps  ({:.2} ms/frame, min={:.2}, max={:.2})",
+                "",
+                mean_fps,
+                mean_ms,
+                min_ms,
+                max_ms,
+            );
+
+            json_results.push(serde_json::json!({
+                "benchmark": "compositor_only",
+                "scenario": scenario.label,
+                "width": w,
+                "height": h,
+                "frame_count": args.frame_count,
+                "iterations": args.iterations,
+                "mean_fps": mean_fps,
+                "mean_ms_per_frame": mean_ms,
+                "min_ms_per_frame": min_ms,
+                "max_ms_per_frame": max_ms,
+            }));
+        }
+        eprintln!();
+    }
+
+    // Machine-readable JSON output.
+    println!("{}", serde_json::to_string_pretty(&json_results).expect("JSON serialization"));
+}

From 3a7a2b20c25fdd214de32efdd9de302e2a616d18 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:31:02 +0000
Subject: [PATCH 02/12] style: apply rustfmt to compositor_only benchmark

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/engine/benches/compositor_only.rs | 82 +++++-------------------
 1 file changed, 16 insertions(+), 66 deletions(-)

diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs
index 0b66c961..31ea5b85 100644
--- a/crates/engine/benches/compositor_only.rs
+++ b/crates/engine/benches/compositor_only.rs
@@ -170,7 +170,9 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> {
     let chroma_h = h.div_ceil(2);
     let nv12_size = w * h + chroma_w * 2 * chroma_h;
     let mut nv12 = vec![0u8; nv12_size];
-    streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf(&rgba, width, height, &mut nv12);
+    streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf(
+        &rgba, width, height, &mut nv12,
+    );
     nv12
 }
 
@@ -251,10 +253,7 @@ fn bench_composite(
     }
 
     let elapsed = start.elapsed();
-    BenchResult {
-        total_secs: elapsed.as_secs_f64(),
-        frame_count,
-    }
+    BenchResult { total_secs: elapsed.as_secs_f64(), frame_count }
 }
 
 struct BenchResult {
@@ -341,12 +340,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                     pip_w,
                     pip_h,
                     PixelFormat::Rgba8,
-                    Some(Rect {
-                        x: pip_x,
-                        y: pip_y,
-                        width: pip_w,
-                        height: pip_h,
-                    }),
+                    Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
                     0.9,
                     1,
                     0.0,
@@ -372,12 +366,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                     pip_w,
                     pip_h,
                     PixelFormat::Rgba8,
-                    Some(Rect {
-                        x: pip_x,
-                        y: pip_y,
-                        width: pip_w,
-                        height: pip_h,
-                    }),
+                    Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
                     0.9,
                     1,
                     0.0,
@@ -387,12 +376,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                     pip_w,
                     pip_h,
                     PixelFormat::Rgba8,
-                    Some(Rect {
-                        x: 20,
-                        y: 20,
-                        width: pip_w,
-                        height: pip_h,
-                    }),
+                    Some(Rect { x: 20, y: 20, width: pip_w, height: pip_h }),
                     0.8,
                     2,
                     0.0,
@@ -402,12 +386,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                     pip_w,
                     pip_h,
                     PixelFormat::Rgba8,
-                    Some(Rect {
-                        x: 20,
-                        y: pip_y,
-                        width: pip_w,
-                        height: pip_h,
-                    }),
+                    Some(Rect { x: 20, y: pip_y, width: pip_w, height: pip_h }),
                     0.7,
                     3,
                     0.0,
@@ -433,12 +412,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                     pip_w,
                     pip_h,
                     PixelFormat::Rgba8,
-                    Some(Rect {
-                        x: pip_x,
-                        y: pip_y,
-                        width: pip_w,
-                        height: pip_h,
-                    }),
+                    Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
                     0.9,
                     1,
                     0.0,
@@ -464,12 +438,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                     pip_w,
                     pip_h,
                     PixelFormat::Rgba8,
-                    Some(Rect {
-                        x: pip_x,
-                        y: pip_y,
-                        width: pip_w,
-                        height: pip_h,
-                    }),
+                    Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
                     0.9,
                     1,
                     0.0,
@@ -495,12 +464,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                     pip_w,
                     pip_h,
                     PixelFormat::Rgba8,
-                    Some(Rect {
-                        x: pip_x,
-                        y: pip_y,
-                        width: pip_w,
-                        height: pip_h,
-                    }),
+                    Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
                     0.9,
                     1,
                     15.0, // 15° rotation
@@ -511,9 +475,8 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
         Scenario {
             label: "2-layer-rgba-static".to_string(),
             layers: {
-                let bg = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(
-                    canvas_w, canvas_h,
-                )));
+                let bg =
+                    Arc::new(PooledVideoData::from_vec(generate_rgba_frame(canvas_w, canvas_h)));
                 let pip = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(pip_w, pip_h)));
                 vec![
                     Some(LayerSnapshot {
@@ -531,12 +494,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> {
                         width: pip_w,
                         height: pip_h,
                         pixel_format: PixelFormat::Rgba8,
-                        rect: Some(Rect {
-                            x: pip_x,
-                            y: pip_y,
-                            width: pip_w,
-                            height: pip_h,
-                        }),
+                        rect: Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
                         opacity: 0.9,
                         z_index: 1,
                         rotation_degrees: 0.0,
@@ -568,11 +526,7 @@ fn main() {
     eprintln!("╠══════════════════════════════════════════════════════════╣");
     eprintln!(
         "║  Resolutions : {:<41}║",
-        resolutions
-            .iter()
-            .map(|(w, h)| format!("{w}×{h}"))
-            .collect::<Vec<_>>()
-            .join(", ")
+        resolutions.iter().map(|(w, h)| format!("{w}×{h}")).collect::<Vec<_>>().join(", ")
     );
     eprintln!("║  Frames      : {:<41}║", args.frame_count);
     eprintln!("║  Iterations  : {:<41}║", args.iterations);
@@ -621,11 +575,7 @@ fn main() {
 
             eprintln!(
                 "  {:<28} avg: {:>8.1} fps  ({:.2} ms/frame, min={:.2}, max={:.2})",
-                "",
-                mean_fps,
-                mean_ms,
-                min_ms,
-                max_ms,
+                "", mean_fps, mean_ms, min_ms, max_ms,
             );
 
             json_results.push(serde_json::json!({

From 74fedf8de0dc18604303c3bacca46cf624064be1 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:33:17 +0000
Subject: [PATCH 03/12] =?UTF-8?q?perf(compositor):=20cache=20YUV=E2=86=92R?=
 =?UTF-8?q?GBA=20conversions=20+=20skip=20canvas=20clear?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Optimization 1: Add ConversionCache that tracks Arc pointer identity
per layer slot. When the source Arc<PooledVideoData> hasn't changed
between frames, the cached RGBA data is reused (zero conversion cost).
Replaces the old i420_scratch buffer approach.

Optimization 2: Skip buf.fill(0) canvas clear when the first visible
layer is opaque, unrotated, and fully covers the canvas dimensions.
Saves one full-canvas memset per frame in the common case.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/compositor/kernel.rs | 146 +++++++++++++++++---
 crates/nodes/src/video/compositor/mod.rs    |  26 ++--
 2 files changed, 137 insertions(+), 35 deletions(-)

diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs
index c173360e..76b0b0c7 100644
--- a/crates/nodes/src/video/compositor/kernel.rs
+++ b/crates/nodes/src/video/compositor/kernel.rs
@@ -19,6 +19,116 @@ use super::pixel_ops::{
 
 // ── Compositing kernel (runs on a persistent blocking thread) ────────────────
 
+// ── YUV → RGBA conversion cache ─────────────────────────────────────────────
+
+/// Cached RGBA conversion result for a single layer slot.
+struct CachedConversion {
+    /// Identity of the source data (`Arc::as_ptr` cast to `usize`).
+    /// When the `Arc<PooledVideoData>` pointer hasn't changed between frames
+    /// the underlying data is identical and the conversion can be skipped.
+    data_identity: usize,
+    width: u32,
+    height: u32,
+    /// Pre-converted RGBA8 data, stored as a plain `Vec<u8>`.
+    rgba: Vec<u8>,
+}
+
+/// Per-slot cache for YUV → RGBA conversions.
+///
+/// Avoids redundant per-frame I420/NV12 → RGBA8 conversion when the source
+/// `Arc<PooledVideoData>` hasn't changed since the previous frame.
+pub struct ConversionCache {
+    entries: Vec<Option<CachedConversion>>,
+}
+
+impl ConversionCache {
+    pub const fn new() -> Self {
+        Self { entries: Vec::new() }
+    }
+
+    /// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`.
+    /// Returns a slice of RGBA8 data.
+    fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
+        let identity = Arc::as_ptr(&layer.data) as usize;
+
+        // Ensure the cache Vec is large enough.
+        if self.entries.len() <= slot_idx {
+            self.entries.resize_with(slot_idx + 1, || None);
+        }
+
+        // Check if the cached entry is still valid.
+        let needs_convert = match &self.entries[slot_idx] {
+            Some(cached) => {
+                cached.data_identity != identity
+                    || cached.width != layer.width
+                    || cached.height != layer.height
+            },
+            None => true,
+        };
+
+        if needs_convert {
+            let needed = layer.width as usize * layer.height as usize * 4;
+            // Reuse the existing allocation if possible.
+            let mut rgba = self.entries[slot_idx].take().map(|c| c.rgba).unwrap_or_default();
+            if rgba.len() < needed {
+                rgba.resize(needed, 0);
+            }
+
+            match layer.pixel_format {
+                PixelFormat::I420 => {
+                    i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba);
+                },
+                PixelFormat::Nv12 => {
+                    nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba);
+                },
+                PixelFormat::Rgba8 => {
+                    // Should not be called for RGBA, but handle gracefully.
+                    rgba[..needed].copy_from_slice(&layer.data.as_slice()[..needed]);
+                },
+            }
+
+            self.entries[slot_idx] = Some(CachedConversion {
+                data_identity: identity,
+                width: layer.width,
+                height: layer.height,
+                rgba,
+            });
+        }
+
+        let cached = self.entries[slot_idx].as_ref().expect("just inserted");
+        let needed = layer.width as usize * layer.height as usize * 4;
+        &cached.rgba[..needed]
+    }
+}
+
+/// Returns `true` if the first visible layer is fully opaque, unrotated, and
+/// covers the entire canvas — meaning the canvas clear can be skipped.
+fn first_layer_covers_canvas(
+    layers: &[Option<LayerSnapshot>],
+    canvas_w: u32,
+    canvas_h: u32,
+) -> bool {
+    let Some(first) = layers.iter().flatten().next() else {
+        return false;
+    };
+
+    if first.opacity < 1.0 || first.rotation_degrees.abs() >= 0.01 {
+        return false;
+    }
+
+    // Check if the layer fully covers the canvas.
+    // A layer with no rect fills the entire canvas by default.
+    match &first.rect {
+        None => true,
+        Some(r) => {
+            r.x <= 0
+                && r.y <= 0
+                && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
+                && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
+        },
+    }
+}
+
 /// Snapshot of one input layer's data for the blocking compositor thread.
 pub struct LayerSnapshot {
     pub data: Arc<streamkit_core::frame_pool::PooledVideoData>,
@@ -56,8 +166,8 @@ pub struct CompositeResult {
 /// Composite all layers + overlays onto a fresh RGBA8 canvas buffer.
 /// Allocates from the video pool if available.
 ///
-/// `i420_scratch` is a reusable buffer for I420/NV12→RGBA8 conversion,
-/// avoiding per-frame allocation.
+/// `conversion_cache` caches YUV→RGBA8 conversions across frames so that
+/// unchanged layers skip the conversion entirely.
 pub fn composite_frame(
     canvas_w: u32,
     canvas_h: u32,
@@ -65,7 +175,7 @@ pub fn composite_frame(
     image_overlays: &[Arc<DecodedOverlay>],
     text_overlays: &[Arc<DecodedOverlay>],
     video_pool: Option<&streamkit_core::VideoFramePool>,
-    i420_scratch: &mut Vec<u8>,
+    conversion_cache: &mut ConversionCache,
 ) -> streamkit_core::frame_pool::PooledVideoData {
     let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4;
 
@@ -74,33 +184,25 @@ pub fn composite_frame(
         |pool| pool.get(total_bytes),
     );
 
-    // Zero the buffer (transparent black).
     let buf = pooled.as_mut_slice();
-    buf[..total_bytes].fill(0);
+
+    // Skip the canvas clear when the first layer is opaque, unrotated, and
+    // covers the entire canvas — the blit will fully overwrite every pixel.
+    if !first_layer_covers_canvas(layers, canvas_w, canvas_h) {
+        buf[..total_bytes].fill(0);
+    }
 
     // Blit each layer (in order — first layer is bottom, last is top).
-    // I420 layers are converted to RGBA8 on-the-fly using the scratch buffer.
-    for layer in layers.iter().flatten() {
+    // Non-RGBA layers use the conversion cache to avoid redundant per-frame
+    // YUV→RGBA8 conversion when the source data hasn't changed.
+    for (slot_idx, layer) in layers.iter().flatten().enumerate() {
         let dst_rect =
             layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h });
 
         let src_data: &[u8] = match layer.pixel_format {
             PixelFormat::Rgba8 => layer.data.as_slice(),
-            PixelFormat::I420 => {
-                let needed = layer.width as usize * layer.height as usize * 4;
-                if i420_scratch.len() < needed {
-                    i420_scratch.resize(needed, 0);
-                }
-                i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch);
-                &i420_scratch[..needed]
-            },
-            PixelFormat::Nv12 => {
-                let needed = layer.width as usize * layer.height as usize * 4;
-                if i420_scratch.len() < needed {
-                    i420_scratch.resize(needed, 0);
-                }
-                nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch);
-                &i420_scratch[..needed]
+            PixelFormat::I420 | PixelFormat::Nv12 => {
+                conversion_cache.get_or_convert(slot_idx, layer)
             },
         };
 
diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs
index 34dde60a..8b743172 100644
--- a/crates/nodes/src/video/compositor/mod.rs
+++ b/crates/nodes/src/video/compositor/mod.rs
@@ -48,7 +48,7 @@ use streamkit_core::{
 };
 use tokio::sync::mpsc;
 
-use kernel::composite_frame;
+use kernel::{composite_frame, ConversionCache};
 
 // ── Input slot ──────────────────────────────────────────────────────────────
 
@@ -285,9 +285,9 @@ impl ProcessorNode for CompositorNode {
         let (result_tx, mut result_rx) = tokio::sync::mpsc::channel::<CompositeResult>(2);
 
         let composite_thread = tokio::task::spawn_blocking(move || {
-            // Persistent scratch buffer for I420→RGBA8 layer conversion,
-            // reused across frames to avoid per-frame allocation.
-            let mut i420_to_rgba_scratch: Vec<u8> = Vec::new();
+            // Per-slot cache for YUV→RGBA conversions. Avoids redundant
+            // conversion when the source Arc hasn't changed between frames.
+            let mut conversion_cache = ConversionCache::new();
 
             while let Some(work) = work_rx.blocking_recv() {
                 let rgba_buf = composite_frame(
@@ -297,7 +297,7 @@ impl ProcessorNode for CompositorNode {
                     &work.image_overlays,
                     &work.text_overlays,
                     work.video_pool.as_deref(),
-                    &mut i420_to_rgba_scratch,
+                    &mut conversion_cache,
                 );
                 let result = CompositeResult { rgba_data: rgba_buf };
                 if result_tx.blocking_send(result).is_err() {
@@ -851,8 +851,8 @@ mod tests {
     #[test]
     fn test_composite_frame_empty_layers() {
         // No layers, no overlays -> transparent black canvas.
-        let mut scratch = Vec::new();
-        let result = composite_frame(4, 4, &[], &[], &[], None, &mut scratch);
+        let mut cache = ConversionCache::new();
+        let result = composite_frame(4, 4, &[], &[], &[], None, &mut cache);
         let buf = result.as_slice();
         assert_eq!(buf.len(), 4 * 4 * 4);
         assert!(buf.iter().all(|&b| b == 0));
@@ -872,8 +872,8 @@ mod tests {
             rotation_degrees: 0.0,
         };
 
-        let mut scratch = Vec::new();
-        let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut scratch);
+        let mut cache = ConversionCache::new();
+        let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut cache);
         let buf = result.as_slice();
 
         // Entire canvas should be red (scaled from 2x2 to 4x4).
@@ -912,9 +912,9 @@ mod tests {
             rotation_degrees: 0.0,
         };
 
-        let mut scratch = Vec::new();
+        let mut cache = ConversionCache::new();
         let result =
-            composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut scratch);
+            composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut cache);
         let buf = result.as_slice();
 
         // (0,0) should be red.
@@ -1079,8 +1079,8 @@ mod tests {
         let pool = FramePool::<u8>::preallocated(&[total], 2);
         assert_eq!(pool.stats().buckets[0].available, 2);
 
-        let mut scratch = Vec::new();
-        let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut scratch);
+        let mut cache = ConversionCache::new();
+        let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut cache);
         assert_eq!(result.as_slice().len(), total);
         // One buffer was taken from the pool.
         assert_eq!(pool.stats().buckets[0].available, 1);

From 4cdc376b9f33ed33669822e5bb71f4018d2dd007 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:34:22 +0000
Subject: [PATCH 04/12] perf(compositor): precompute x-map to eliminate
 per-pixel division

Optimization 3: Replace per-pixel `(dx + src_col_skip) * sw / rw`
integer division in blit_row_opaque/blit_row_alpha with a single
precomputed lookup table (x_map) built once per scale_blit_rgba call.

Each destination column now does a table lookup instead of a division,
removing O(width * height) divisions per layer per frame.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .../nodes/src/video/compositor/pixel_ops.rs   | 45 ++++++++-----------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs
index 3464eb8e..8d3de5b1 100644
--- a/crates/nodes/src/video/compositor/pixel_ops.rs
+++ b/crates/nodes/src/video/compositor/pixel_ops.rs
@@ -101,6 +101,11 @@ pub fn scale_blit_rgba(
         return;
     }
 
+    // Precompute the source-X lookup table once.  This replaces the per-pixel
+    // `(dx + src_col_skip) * sw / rw` integer division with a single table
+    // lookup in the inner blit loops.
+    let x_map: Vec<usize> = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect();
+
     // Split the destination buffer into per-row slices so that each row can
     // be processed independently (and therefore in parallel).
     let row_stride = dw * 4;
@@ -114,24 +119,13 @@ pub fn scale_blit_rgba(
         dst_rows.par_chunks_mut(row_stride).take(effective_rh).enumerate().for_each(
             |(dy, row_slice)| {
                 let sy = (dy + src_row_skip) * sh / rh;
-                blit_row(
-                    row_slice,
-                    rx,
-                    effective_rect_w,
-                    src,
-                    sw,
-                    sh,
-                    sy,
-                    rw,
-                    opacity,
-                    src_col_skip,
-                );
+                blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map);
             },
         );
     } else {
         for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() {
             let sy = (dy + src_row_skip) * sh / rh;
-            blit_row(row_slice, rx, effective_rect_w, src, sw, sh, sy, rw, opacity, src_col_skip);
+            blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map);
         }
     }
 }
@@ -142,6 +136,9 @@ pub fn scale_blit_rgba(
 /// rows in parallel.  The `row_slice` covers exactly one destination row
 /// starting at pixel column 0 (i.e. byte offset `rx * 4` is the first column
 /// we write to).
+///
+/// `x_map` is a precomputed table mapping each destination column to the
+/// corresponding source column, eliminating per-pixel integer division.
 #[allow(
     clippy::cast_possible_truncation,
     clippy::cast_sign_loss,
@@ -155,18 +152,16 @@ fn blit_row(
     effective_rw: usize,
     src: &[u8],
     sw: usize,
-    sh: usize,
     sy: usize,
-    rw: usize,
     opacity: f32,
-    src_col_skip: usize,
+    x_map: &[usize],
 ) {
     // Fast path: when opacity is 1.0, we can skip the f32 multiply on alpha
     // and branch more cheaply.
     if opacity >= 1.0 {
-        blit_row_opaque(row_slice, rx, effective_rw, src, sw, sh, sy, rw, src_col_skip);
+        blit_row_opaque(row_slice, rx, effective_rw, src, sw, sy, x_map);
     } else {
-        blit_row_alpha(row_slice, rx, effective_rw, src, sw, sh, sy, rw, opacity, src_col_skip);
+        blit_row_alpha(row_slice, rx, effective_rw, src, sw, sy, opacity, x_map);
     }
 }
 
@@ -184,6 +179,7 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 {
 /// per-pixel f32 multiply on the source alpha channel.
 ///
 /// Uses integer-only alpha blending for semi-transparent source pixels.
+/// `x_map` provides precomputed source-X indices (one per destination column).
 #[allow(
     clippy::cast_possible_truncation,
     clippy::cast_sign_loss,
@@ -198,14 +194,12 @@ fn blit_row_opaque(
     effective_rw: usize,
     src: &[u8],
     sw: usize,
-    _sh: usize,
     sy: usize,
-    rw: usize,
-    src_col_skip: usize,
+    x_map: &[usize],
 ) {
     let src_row_base = sy * sw * 4;
     for dx in 0..effective_rw {
-        let sx = (dx + src_col_skip) * sw / rw;
+        let sx = x_map[dx];
         let src_idx = src_row_base + sx * 4;
         if src_idx + 3 >= src.len() {
             continue;
@@ -242,6 +236,7 @@ fn blit_row_opaque(
 /// Applies the opacity multiplier to every source pixel's alpha channel.
 ///
 /// Uses integer-only alpha blending.
+/// `x_map` provides precomputed source-X indices (one per destination column).
 #[allow(
     clippy::cast_possible_truncation,
     clippy::cast_sign_loss,
@@ -256,18 +251,16 @@ fn blit_row_alpha(
     effective_rw: usize,
     src: &[u8],
     sw: usize,
-    _sh: usize,
     sy: usize,
-    rw: usize,
     opacity: f32,
-    src_col_skip: usize,
+    x_map: &[usize],
 ) {
     // Pre-compute opacity as a 0..255 integer multiplier.
     let opacity_u16 = (opacity * 255.0 + 0.5) as u16;
     let src_row_base = sy * sw * 4;
 
     for dx in 0..effective_rw {
-        let sx = (dx + src_col_skip) * sw / rw;
+        let sx = x_map[dx];
         let src_idx = src_row_base + sx * 4;
         if src_idx + 3 >= src.len() {
             continue;

From e28123fc1ccc1df4cc9cadd2d18df80b3a48d38a Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:35:16 +0000
Subject: [PATCH 05/12] perf(compositor): add identity-scale fast path for 1:1
 opaque blits

Optimization 4: When source dimensions match the destination rect,
opacity is 1.0, and there's no clipping offset, bypass the x-map
lookup entirely. For fully-opaque source rows, use bulk memcpy
(copy_from_slice). For rows with semi-transparent pixels, use a
simplified per-pixel blend without the scaling indirection.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .../nodes/src/video/compositor/pixel_ops.rs   | 59 +++++++++++++++++--
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs
index 8d3de5b1..f0b61e18 100644
--- a/crates/nodes/src/video/compositor/pixel_ops.rs
+++ b/crates/nodes/src/video/compositor/pixel_ops.rs
@@ -101,11 +101,6 @@ pub fn scale_blit_rgba(
         return;
     }
 
-    // Precompute the source-X lookup table once.  This replaces the per-pixel
-    // `(dx + src_col_skip) * sw / rw` integer division with a single table
-    // lookup in the inner blit loops.
-    let x_map: Vec<usize> = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect();
-
     // Split the destination buffer into per-row slices so that each row can
     // be processed independently (and therefore in parallel).
     let row_stride = dw * 4;
@@ -115,6 +110,60 @@ pub fn scale_blit_rgba(
     let first_row_byte = ry * row_stride;
     let dst_rows = &mut dst[first_row_byte..];
 
+    // ── Identity-scale fast path ───────────────────────────────────────
+    // When source dimensions exactly match the destination rect and opacity
+    // is fully opaque, we can avoid per-pixel scaling entirely and use
+    // direct row copies (memcpy) for fully-opaque source rows.
+    if rw == sw && rh == sh && opacity >= 1.0 && src_col_skip == 0 && src_row_skip == 0 {
+        let src_row_bytes = sw * 4;
+        let copy_bytes = effective_rect_w * 4;
+        for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() {
+            let src_start = dy * src_row_bytes;
+            let src_end = src_start + copy_bytes;
+            if src_end > src.len() {
+                break;
+            }
+            let dst_start = rx * 4;
+            let dst_end = dst_start + copy_bytes;
+            if dst_end > row_slice.len() {
+                break;
+            }
+            // Check if the source row has any semi-transparent pixels.
+            // For fully-opaque rows, use bulk memcpy.  For rows with alpha,
+            // fall back to per-pixel blending.
+            let src_row = &src[src_start..src_end];
+            let all_opaque = src_row.chunks_exact(4).all(|px| px[3] == 255);
+            if all_opaque {
+                row_slice[dst_start..dst_end].copy_from_slice(src_row);
+            } else {
+                // Per-pixel alpha blend (identity scale, so sx == dx).
+                for dx in 0..effective_rect_w {
+                    let si = dx * 4;
+                    let sa = src_row[si + 3];
+                    if sa == 255 {
+                        row_slice[dst_start + dx * 4..dst_start + dx * 4 + 4]
+                            .copy_from_slice(&src_row[si..si + 4]);
+                    } else if sa > 0 {
+                        let di = dst_start + dx * 4;
+                        let a16 = u16::from(sa);
+                        row_slice[di] = blend_u8(src_row[si], row_slice[di], a16);
+                        row_slice[di + 1] = blend_u8(src_row[si + 1], row_slice[di + 1], a16);
+                        row_slice[di + 2] = blend_u8(src_row[si + 2], row_slice[di + 2], a16);
+                        let da = u16::from(row_slice[di + 3]);
+                        row_slice[di + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8;
+                    }
+                }
+            }
+        }
+        return;
+    }
+
+    // ── Scaled blit path ───────────────────────────────────────────────
+    // Precompute the source-X lookup table once.  This replaces the per-pixel
+    // `(dx + src_col_skip) * sw / rw` integer division with a single table
+    // lookup in the inner blit loops.
+    let x_map: Vec<usize> = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect();
+
     if effective_rh >= RAYON_ROW_THRESHOLD {
         dst_rows.par_chunks_mut(row_stride).take(effective_rh).enumerate().for_each(
             |(dy, row_slice)| {

From e54470c21068c5a6331ea8900392720b32dcddee Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:36:00 +0000
Subject: [PATCH 06/12] perf(compositor): pre-scale image overlays at decode
 time

Optimization 5: When a decoded image overlay's native dimensions differ
from its target rect, pre-scale it once using nearest-neighbor at
config/update time. This ensures the per-frame blit_overlay call hits
the identity-scale fast path (memcpy) instead of re-scaling every frame.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/compositor/overlay.rs | 53 +++++++++++++++++---
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/crates/nodes/src/video/compositor/overlay.rs b/crates/nodes/src/video/compositor/overlay.rs
index 3072870a..0fc1e6b2 100644
--- a/crates/nodes/src/video/compositor/overlay.rs
+++ b/crates/nodes/src/video/compositor/overlay.rs
@@ -36,13 +36,52 @@ pub fn decode_image_overlay(config: &ImageOverlayConfig) -> Result<DecodedOverla
     let rgba = img.to_rgba8();
     let (w, h) = img.dimensions();
 
-    Ok(DecodedOverlay {
-        rgba_data: rgba.into_raw(),
-        width: w,
-        height: h,
-        rect: config.rect.clone(),
-        opacity: config.opacity,
-    })
+    let target_w = config.rect.width;
+    let target_h = config.rect.height;
+
+    // Pre-scale the decoded image to the target rect dimensions so that
+    // the per-frame `blit_overlay` → `scale_blit_rgba` call hits the
+    // identity-scale fast path (direct memcpy) instead of doing
+    // nearest-neighbor scaling every frame.
+    if target_w > 0 && target_h > 0 && (w != target_w || h != target_h) {
+        let raw = rgba.into_raw();
+        let scaled = prescale_rgba(&raw, w, h, target_w, target_h);
+        Ok(DecodedOverlay {
+            rgba_data: scaled,
+            width: target_w,
+            height: target_h,
+            rect: config.rect.clone(),
+            opacity: config.opacity,
+        })
+    } else {
+        Ok(DecodedOverlay {
+            rgba_data: rgba.into_raw(),
+            width: w,
+            height: h,
+            rect: config.rect.clone(),
+            opacity: config.opacity,
+        })
+    }
+}
+
+/// Nearest-neighbor scale an RGBA8 buffer from `(sw, sh)` to `(dw, dh)`.
+/// Used once at config time so the per-frame blit is a 1:1 copy.
+fn prescale_rgba(src: &[u8], sw: u32, sh: u32, dw: u32, dh: u32) -> Vec<u8> {
+    let sw = sw as usize;
+    let sh = sh as usize;
+    let dw = dw as usize;
+    let dh = dh as usize;
+    let mut out = vec![0u8; dw * dh * 4];
+    for dy in 0..dh {
+        let sy = dy * sh / dh;
+        for dx in 0..dw {
+            let sx = dx * sw / dw;
+            let si = (sy * sw + sx) * 4;
+            let di = (dy * dw + dx) * 4;
+            out[di..di + 4].copy_from_slice(&src[si..si + 4]);
+        }
+    }
+    out
 }
 
 // ── Bundled default font ────────────────────────────────────────────────────

From 763a6bba0bc1739322b54b1cc3696f0af286fb3e Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:38:13 +0000
Subject: [PATCH 07/12] perf(compositor): cache layer configs and skip
 per-frame sort

Optimization 6: Extract per-slot layer config resolution and z-order
sorting into a rebuild_layer_cache() function that runs only when
config or pin set changes (UpdateParams, pin add/remove, channel close).

Per-frame layer building now uses the cached resolved configs and
pre-sorted draw order instead of doing HashMap lookups and sort_by
on every frame.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/compositor/mod.rs | 142 +++++++++++++++--------
 1 file changed, 92 insertions(+), 50 deletions(-)

diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs
index 8b743172..e27bb8ce 100644
--- a/crates/nodes/src/video/compositor/mod.rs
+++ b/crates/nodes/src/video/compositor/mod.rs
@@ -30,7 +30,7 @@ mod overlay;
 pub mod pixel_ops;
 
 use async_trait::async_trait;
-use config::{CompositorConfig, Rect};
+use config::CompositorConfig;
 use kernel::{CompositeResult, CompositeWorkItem, LayerSnapshot};
 use overlay::{decode_image_overlay, rasterize_text_overlay, DecodedOverlay};
 use schemars::schema_for;
@@ -59,6 +59,63 @@ struct InputSlot {
     latest_frame: Option<VideoFrame>,
 }
 
+// ── Cached layer config ─────────────────────────────────────────────────────
+
+/// Pre-resolved layer configuration for a single slot.
+/// Rebuilt only when compositor config or pin set changes, avoiding
+/// per-frame `HashMap` lookups and `sort_by` calls.
+#[derive(Clone)]
+struct ResolvedSlotConfig {
+    rect: Option<config::Rect>,
+    opacity: f32,
+    z_index: i32,
+    rotation_degrees: f32,
+}
+
+/// Rebuild the per-slot resolved configs and the z-sorted draw order.
+///
+/// Called once at startup and whenever `UpdateParams` or pin management
+/// changes the layer set.  The returned draw order is a list of slot
+/// indices sorted by `(z_index, slot_index)`.
+fn rebuild_layer_cache(
+    slots: &[InputSlot],
+    config: &CompositorConfig,
+) -> (Vec<ResolvedSlotConfig>, Vec<usize>) {
+    let num_slots = slots.len();
+    let mut configs: Vec<ResolvedSlotConfig> = Vec::with_capacity(num_slots);
+    for (idx, slot) in slots.iter().enumerate() {
+        let layer_cfg = config.layers.get(&slot.name);
+        #[allow(clippy::option_if_let_else)]
+        let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg {
+            (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees)
+        } else if idx > 0 && num_slots > 1 {
+            // Auto-PiP: non-first layers without explicit config.
+            let pip_w = config.width / 3;
+            let pip_h = config.height / 3;
+            #[allow(clippy::cast_possible_wrap)]
+            let pip_x = (config.width - pip_w - 20) as i32;
+            #[allow(clippy::cast_possible_wrap)]
+            let pip_y = (config.height - pip_h - 20) as i32;
+            #[allow(clippy::cast_possible_wrap)]
+            (
+                Some(config::Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
+                0.9,
+                idx as i32,
+                0.0,
+            )
+        } else {
+            (None, 1.0, 0, 0.0)
+        };
+        configs.push(ResolvedSlotConfig { rect, opacity, z_index, rotation_degrees });
+    }
+
+    // Pre-sort by (z_index, slot_index).
+    let mut draw_order: Vec<usize> = (0..num_slots).collect();
+    draw_order.sort_by(|&a, &b| configs[a].z_index.cmp(&configs[b].z_index).then(a.cmp(&b)));
+
+    (configs, draw_order)
+}
+
 // ── Node ────────────────────────────────────────────────────────────────────
 
 /// Composites multiple raw video inputs onto a single RGBA8 canvas with
@@ -309,6 +366,14 @@ impl ProcessorNode for CompositorNode {
         let mut output_seq: u64 = 0;
         let mut stop_reason: &str = "shutdown";
 
+        // ── Cached layer config + draw order ────────────────────────────
+        // Rebuilt only when config or pin set changes (UpdateParams,
+        // pin add/remove, channel close).  Avoids per-frame HashMap
+        // lookups and sort_by calls.
+        let mut layer_configs_dirty = true;
+        let mut resolved_configs: Vec<ResolvedSlotConfig> = Vec::new();
+        let mut sorted_draw_order: Vec<usize> = Vec::new();
+
         loop {
             // ── Take at most one frame from every slot (non-blocking) ───
             // We intentionally take only one frame per slot per iteration so
@@ -347,6 +412,7 @@ impl ProcessorNode for CompositorNode {
                                     params,
                                     &mut stats_tracker,
                                 );
+                                layer_configs_dirty = true;
                             },
                             NodeControlMessage::Start => {},
                         }
@@ -364,6 +430,7 @@ impl ProcessorNode for CompositorNode {
                             msg,
                             &mut slots,
                         );
+                        layer_configs_dirty = true;
                     }
 
                     // Wait for a frame from any connected input.
@@ -379,6 +446,7 @@ impl ProcessorNode for CompositorNode {
                                     slots[slot_idx].name
                                 );
                                 slots.remove(slot_idx);
+                                layer_configs_dirty = true;
                                 if slots.is_empty() {
                                     stop_reason = "all_inputs_closed";
                                     should_break = true;
@@ -425,6 +493,7 @@ impl ProcessorNode for CompositorNode {
                                     params,
                                     &mut stats_tracker,
                                 );
+                                layer_configs_dirty = true;
                             },
                             NodeControlMessage::Start => {},
                         }
@@ -440,6 +509,7 @@ impl ProcessorNode for CompositorNode {
                             msg,
                             &mut slots,
                         );
+                        layer_configs_dirty = true;
                     }
                 }
                 continue;
@@ -463,6 +533,7 @@ impl ProcessorNode for CompositorNode {
                             params,
                             &mut stats_tracker,
                         );
+                        layer_configs_dirty = true;
                     },
                     NodeControlMessage::Start => {},
                 }
@@ -473,69 +544,40 @@ impl ProcessorNode for CompositorNode {
             if let Some(ref mut pmrx) = pin_mgmt_rx {
                 while let Ok(msg) = pmrx.try_recv() {
                     Self::handle_pin_management(&mut self, msg, &mut slots);
+                    layer_configs_dirty = true;
                 }
             }
 
+            // ── Rebuild layer config cache if needed ─────────────────────
+            if layer_configs_dirty {
+                let (cfgs, order) = rebuild_layer_cache(&slots, &self.config);
+                resolved_configs = cfgs;
+                sorted_draw_order = order;
+                layer_configs_dirty = false;
+            }
+
             // ── Send work to persistent compositing thread ─────────────
-            // Collect the data we need to send to the blocking thread.
-            let num_slots = slots.len();
-            let mut layers: Vec<Option<LayerSnapshot>> = slots
+            // Build layer snapshots in pre-sorted draw order using the
+            // cached per-slot configs (no HashMap lookup, no sort).
+            let layers: Vec<Option<LayerSnapshot>> = sorted_draw_order
                 .iter()
-                .enumerate()
-                .map(|(idx, slot)| {
-                    slot.latest_frame.as_ref().map(|f| {
-                        let layer_cfg = self.config.layers.get(&slot.name);
-                        #[allow(clippy::option_if_let_else)]
-                        let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg
-                        {
-                            // Explicit per-layer config.
-                            (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees)
-                        } else if idx > 0 && num_slots > 1 {
-                            // Auto-PiP: non-first layers without explicit config
-                            // are placed in the bottom-right corner at 1/3 canvas
-                            // size with slight transparency.
-                            let pip_w = self.config.width / 3;
-                            let pip_h = self.config.height / 3;
-                            #[allow(clippy::cast_possible_wrap)]
-                            let pip_x = (self.config.width - pip_w - 20) as i32;
-                            #[allow(clippy::cast_possible_wrap)]
-                            let pip_y = (self.config.height - pip_h - 20) as i32;
-                            #[allow(clippy::cast_possible_wrap)]
-                            (
-                                Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
-                                0.9,
-                                idx as i32,
-                                0.0,
-                            )
-                        } else {
-                            // First layer (or single input): fill the canvas.
-                            (None, 1.0, 0, 0.0)
-                        };
+                .map(|&idx| {
+                    slots[idx].latest_frame.as_ref().map(|f| {
+                        let cfg = &resolved_configs[idx];
                         LayerSnapshot {
                             data: f.data.clone(),
                             width: f.width,
                             height: f.height,
                             pixel_format: f.pixel_format,
-                            rect,
-                            opacity,
-                            z_index,
-                            rotation_degrees,
+                            rect: cfg.rect.clone(),
+                            opacity: cfg.opacity,
+                            z_index: cfg.z_index,
+                            rotation_degrees: cfg.rotation_degrees,
                         }
                     })
                 })
                 .collect();
 
-            // Sort layers by z_index so that lower values are drawn first
-            // (bottom of the stack).  `None` entries (slots without a frame)
-            // are pushed to the end — they are skipped during compositing
-            // anyway.
-            layers.sort_by(|a, b| match (a, b) {
-                (Some(la), Some(lb)) => la.z_index.cmp(&lb.z_index),
-                (Some(_), None) => std::cmp::Ordering::Less,
-                (None, Some(_)) => std::cmp::Ordering::Greater,
-                (None, None) => std::cmp::Ordering::Equal,
-            });
-
             stats_tracker.received();
 
             let work_item = CompositeWorkItem {
@@ -773,7 +815,7 @@ mod tests {
     use crate::test_utils::{
         assert_state_initializing, assert_state_running, assert_state_stopped, create_test_context,
     };
-    use config::LayerConfig;
+    use config::{LayerConfig, Rect};
     use pixel_ops::scale_blit_rgba;
     use std::collections::HashMap;
     use tokio::sync::mpsc;

From 91e93bb727f834426f9c9f647847fb518cbe6e27 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:38:47 +0000
Subject: [PATCH 08/12] perf(frame_pool): preallocate video pool buckets at
 startup

Optimization 7: Change video_default() from with_buckets (lazy, no
preallocation) to preallocated_with_max with 2 buffers per bucket.
This avoids cold-start allocation misses for the first few frames,
matching the existing audio_default() pattern.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/core/src/frame_pool.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs
index c7a6a829..1e90b1e3 100644
--- a/crates/core/src/frame_pool.rs
+++ b/crates/core/src/frame_pool.rs
@@ -325,10 +325,15 @@ pub const DEFAULT_VIDEO_BUCKET_SIZES: &[usize] = &[
 ];
 pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16;
 
+/// Number of buffers to preallocate per video bucket at startup.
+/// Avoids cold-start misses for the first few frames.
+pub const DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET: usize = 2;
+
 impl FramePool<u8> {
     pub fn video_default() -> Self {
-        Self::with_buckets(
-            DEFAULT_VIDEO_BUCKET_SIZES.to_vec(),
+        Self::preallocated_with_max(
+            DEFAULT_VIDEO_BUCKET_SIZES,
+            DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET,
             DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET,
         )
     }

From e942408ceffe74cfc4c21bd26b1f703e7914dc8b Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 08:45:11 +0000
Subject: [PATCH 09/12] style(compositor): fix clippy warnings from
 optimization changes

- Use map_or instead of match/if-let-else in ConversionCache and
  first_layer_covers_canvas
- Allow expect_used with safety comment in get_or_convert
- Allow dead_code on LayerSnapshot::z_index (sorting moved upstream)
- Allow needless_range_loop in blit_row_opaque/blit_row_alpha (dx used
  for both x_map index and dst offset)
- Allow cast_possible_truncation on idx as i32 in rebuild_layer_cache

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/compositor/kernel.rs   | 37 +++++++++----------
 crates/nodes/src/video/compositor/mod.rs      |  2 +-
 .../nodes/src/video/compositor/pixel_ops.rs   |  6 ++-
 3 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs
index 76b0b0c7..2f52485a 100644
--- a/crates/nodes/src/video/compositor/kernel.rs
+++ b/crates/nodes/src/video/compositor/kernel.rs
@@ -57,14 +57,11 @@ impl ConversionCache {
         }
 
         // Check if the cached entry is still valid.
-        let needs_convert = match &self.entries[slot_idx] {
-            Some(cached) => {
-                cached.data_identity != identity
-                    || cached.width != layer.width
-                    || cached.height != layer.height
-            },
-            None => true,
-        };
+        let needs_convert = self.entries[slot_idx].as_ref().map_or(true, |cached| {
+            cached.data_identity != identity
+                || cached.width != layer.width
+                || cached.height != layer.height
+        });
 
         if needs_convert {
             let needed = layer.width as usize * layer.height as usize * 4;
@@ -95,6 +92,9 @@ impl ConversionCache {
             });
         }
 
+        // SAFETY: we just inserted into this slot above when `needs_convert` was true,
+        // and the slot was already `Some` when `needs_convert` was false.
+        #[allow(clippy::expect_used)]
         let cached = self.entries[slot_idx].as_ref().expect("just inserted");
         let needed = layer.width as usize * layer.height as usize * 4;
         &cached.rgba[..needed]
@@ -118,15 +118,12 @@ fn first_layer_covers_canvas(
 
     // Check if the layer fully covers the canvas.
     // A layer with no rect fills the entire canvas by default.
-    match &first.rect {
-        None => true,
-        Some(r) => {
-            r.x <= 0
-                && r.y <= 0
-                && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
-                && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
-        },
-    }
+    first.rect.as_ref().map_or(true, |r| {
+        r.x <= 0
+            && r.y <= 0
+            && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
+            && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
+    })
 }
 
 /// Snapshot of one input layer's data for the blocking compositor thread.
@@ -137,8 +134,10 @@ pub struct LayerSnapshot {
     pub pixel_format: PixelFormat,
     pub rect: Option<Rect>,
     pub opacity: f32,
-    /// Visual stacking order.  Lower values are drawn first (bottom).
-    /// Used to sort layers before compositing; ties broken by slot index.
+    /// Visual stacking order.  Retained in the snapshot for diagnostic /
+    /// logging purposes even though sorting now happens before snapshot
+    /// construction.
+    #[allow(dead_code)]
     pub z_index: i32,
     /// Clockwise rotation in degrees around the destination rect centre.
     /// Default `0.0` means no rotation.
diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs
index e27bb8ce..51b20429 100644
--- a/crates/nodes/src/video/compositor/mod.rs
+++ b/crates/nodes/src/video/compositor/mod.rs
@@ -96,7 +96,7 @@ fn rebuild_layer_cache(
             let pip_x = (config.width - pip_w - 20) as i32;
             #[allow(clippy::cast_possible_wrap)]
             let pip_y = (config.height - pip_h - 20) as i32;
-            #[allow(clippy::cast_possible_wrap)]
+            #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
             (
                 Some(config::Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }),
                 0.9,
diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs
index f0b61e18..a87f13fd 100644
--- a/crates/nodes/src/video/compositor/pixel_ops.rs
+++ b/crates/nodes/src/video/compositor/pixel_ops.rs
@@ -234,7 +234,8 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 {
     clippy::cast_sign_loss,
     clippy::too_many_arguments,
     clippy::suboptimal_flops,
-    clippy::inline_always
+    clippy::inline_always,
+    clippy::needless_range_loop
 )]
 #[inline(always)]
 fn blit_row_opaque(
@@ -291,7 +292,8 @@ fn blit_row_opaque(
     clippy::cast_sign_loss,
     clippy::too_many_arguments,
     clippy::suboptimal_flops,
-    clippy::inline_always
+    clippy::inline_always,
+    clippy::needless_range_loop
 )]
 #[inline(always)]
 fn blit_row_alpha(

From 8311d56208d561816e08fd334cc48dd8a8a18c0f Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 09:15:55 +0000
Subject: [PATCH 10/12] fix(compositor): address correctness + bench issues
 from review

- Fix #1 (High): skip-clear now validates source pixel alpha (all pixels
  must have alpha==255) before skipping canvas clear. Prevents blending
  against stale pooled buffer data when RGBA source has transparency.

- Fix #2 (Medium): conversion cache slot indices now use position in the
  full layers slice (with None holes) via two-pass resolution, so cache
  keys stay stable when slots gain/lose frames.

- Fix #3 (Medium): benchmark now calls real composite_frame() kernel
  instead of reimplementing compositing inline. Exercises all kernel
  optimizations (cache, clear-skip, identity fast-path, x-map).

- Fix Devin Review: revert video pool preallocation (was allocating
  ~121MB across all bucket sizes at startup). Restored lazy allocation.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/core/src/frame_pool.rs               | 10 +--
 crates/engine/benches/compositor_only.rs    | 91 ++++---------------
 crates/nodes/src/video/compositor/kernel.rs | 97 +++++++++++++++++----
 crates/nodes/src/video/compositor/mod.rs    |  4 +-
 4 files changed, 100 insertions(+), 102 deletions(-)

diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs
index 1e90b1e3..9e3bf98e 100644
--- a/crates/core/src/frame_pool.rs
+++ b/crates/core/src/frame_pool.rs
@@ -325,17 +325,9 @@ pub const DEFAULT_VIDEO_BUCKET_SIZES: &[usize] = &[
 ];
 pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16;
 
-/// Number of buffers to preallocate per video bucket at startup.
-/// Avoids cold-start misses for the first few frames.
-pub const DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET: usize = 2;
-
 impl FramePool<u8> {
     pub fn video_default() -> Self {
-        Self::preallocated_with_max(
-            DEFAULT_VIDEO_BUCKET_SIZES,
-            DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET,
-            DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET,
-        )
+        Self::with_buckets(DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET)
     }
 }
 
diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs
index 31ea5b85..f5036d7d 100644
--- a/crates/engine/benches/compositor_only.rs
+++ b/crates/engine/benches/compositor_only.rs
@@ -41,21 +41,10 @@ use streamkit_core::types::PixelFormat;
 
 // Re-use the compositor kernel and pixel_ops directly.
 use streamkit_nodes::video::compositor::config::Rect;
+use streamkit_nodes::video::compositor::kernel::{composite_frame, ConversionCache, LayerSnapshot};
+use streamkit_nodes::video::compositor::overlay::DecodedOverlay;
 use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420;
 
-/// Inline copy of `LayerSnapshot` to avoid depending on the private `kernel` module.
-/// Must stay in sync with `kernel::LayerSnapshot`.
-struct LayerSnapshot {
-    data: Arc<PooledVideoData>,
-    width: u32,
-    height: u32,
-    pixel_format: PixelFormat,
-    rect: Option<Rect>,
-    opacity: f32,
-    z_index: i32,
-    rotation_degrees: f32,
-}
-
 // ── Default benchmark parameters ────────────────────────────────────────────
 
 const DEFAULT_WIDTH: u32 = 1280;
@@ -178,8 +167,10 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> {
 
 // ── Compositing harness ─────────────────────────────────────────────────────
 
-/// Directly call the compositing kernel for `frame_count` iterations,
-/// returning per-frame timing statistics.
+/// Call the real `composite_frame` kernel for `frame_count` iterations,
+/// returning per-frame timing statistics.  This exercises all kernel
+/// optimizations: conversion cache, skip-canvas-clear, identity-scale
+/// fast-path, precomputed x-map, etc.
 fn bench_composite(
     _label: &str,
     canvas_w: u32,
@@ -187,69 +178,21 @@ fn bench_composite(
     layers: &[Option<LayerSnapshot>],
     frame_count: u32,
 ) -> BenchResult {
-    // Re-create the kernel's compositing logic inline since `composite_frame`
-    // is pub(crate). We call the public pixel_ops functions directly.
-    let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4;
-    let mut canvas = vec![0u8; total_bytes];
-    let mut i420_scratch: Vec<u8> = Vec::new();
+    let empty_overlays: Vec<Arc<DecodedOverlay>> = Vec::new();
+    let mut conversion_cache = ConversionCache::new();
 
     let start = Instant::now();
 
     for _ in 0..frame_count {
-        // Zero the canvas.
-        canvas.fill(0);
-
-        // Blit each layer.
-        for layer in layers.iter().flatten() {
-            let dst_rect = layer.rect.clone().unwrap_or(Rect {
-                x: 0,
-                y: 0,
-                width: canvas_w,
-                height: canvas_h,
-            });
-
-            let src_data: &[u8] = match layer.pixel_format {
-                PixelFormat::Rgba8 => layer.data.as_slice(),
-                PixelFormat::I420 => {
-                    let needed = layer.width as usize * layer.height as usize * 4;
-                    if i420_scratch.len() < needed {
-                        i420_scratch.resize(needed, 0);
-                    }
-                    streamkit_nodes::video::compositor::pixel_ops::i420_to_rgba8_buf(
-                        layer.data.as_slice(),
-                        layer.width,
-                        layer.height,
-                        &mut i420_scratch,
-                    );
-                    &i420_scratch[..needed]
-                },
-                PixelFormat::Nv12 => {
-                    let needed = layer.width as usize * layer.height as usize * 4;
-                    if i420_scratch.len() < needed {
-                        i420_scratch.resize(needed, 0);
-                    }
-                    streamkit_nodes::video::compositor::pixel_ops::nv12_to_rgba8_buf(
-                        layer.data.as_slice(),
-                        layer.width,
-                        layer.height,
-                        &mut i420_scratch,
-                    );
-                    &i420_scratch[..needed]
-                },
-            };
-
-            streamkit_nodes::video::compositor::pixel_ops::scale_blit_rgba_rotated(
-                &mut canvas,
-                canvas_w,
-                canvas_h,
-                src_data,
-                layer.width,
-                layer.height,
-                &dst_rect,
-                layer.opacity,
-                layer.rotation_degrees,
-            );
-        }
+        let _result = composite_frame(
+            canvas_w,
+            canvas_h,
+            layers,
+            &empty_overlays,
+            &empty_overlays,
+            None,
+            &mut conversion_cache,
+        );
     }
 
     let elapsed = start.elapsed();
diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs
index 2f52485a..3e979fe6 100644
--- a/crates/nodes/src/video/compositor/kernel.rs
+++ b/crates/nodes/src/video/compositor/kernel.rs
@@ -46,6 +46,23 @@ impl ConversionCache {
         Self { entries: Vec::new() }
     }
 
+    /// Return a previously-cached RGBA slice for `slot_idx`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the slot has not been populated by a prior `get_or_convert`
+    /// call for the same `layer`.  This is only called in the second pass of
+    /// `composite_frame` after the first pass has ensured every non-RGBA
+    /// layer has been converted.
+    fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
+        #[allow(clippy::expect_used)]
+        let cached = self.entries[slot_idx]
+            .as_ref()
+            .expect("get_cached called before get_or_convert");
+        let needed = layer.width as usize * layer.height as usize * 4;
+        &cached.rgba[..needed]
+    }
+
     /// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`.
     /// Returns a slice of RGBA8 data.
     fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
@@ -101,12 +118,20 @@ impl ConversionCache {
     }
 }
 
-/// Returns `true` if the first visible layer is fully opaque, unrotated, and
-/// covers the entire canvas — meaning the canvas clear can be skipped.
+/// Returns `true` if the first visible layer is fully opaque (both layer
+/// opacity *and* source pixel alpha), unrotated, and covers the entire
+/// canvas — meaning the canvas clear can be skipped.
+///
+/// `first_src_data` is the RGBA8 source buffer for the first layer (after
+/// any YUV→RGBA conversion).  We check that every pixel in the region that
+/// will be blitted has `alpha == 255`; if any pixel is semi-transparent the
+/// clear cannot be skipped because the blit would blend with uninitialised
+/// (or stale pooled) canvas bytes.
 fn first_layer_covers_canvas(
     layers: &[Option<LayerSnapshot>],
     canvas_w: u32,
     canvas_h: u32,
+    first_src_data: Option<&[u8]>,
 ) -> bool {
     let Some(first) = layers.iter().flatten().next() else {
         return false;
@@ -118,12 +143,23 @@ fn first_layer_covers_canvas(
 
     // Check if the layer fully covers the canvas.
     // A layer with no rect fills the entire canvas by default.
-    first.rect.as_ref().map_or(true, |r| {
+    let covers = first.rect.as_ref().map_or(true, |r| {
         r.x <= 0
             && r.y <= 0
             && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
             && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
-    })
+    });
+    if !covers {
+        return false;
+    }
+
+    // Verify that *all* source pixels are fully opaque (alpha == 255).
+    // Without this check, semi-transparent source pixels would blend with
+    // uninitialised canvas bytes when the clear is skipped.
+    let Some(src) = first_src_data else {
+        return false;
+    };
+    src.chunks_exact(4).all(|px| px[3] == 255)
 }
 
 /// Snapshot of one input layer's data for the blocking compositor thread.
@@ -185,26 +221,53 @@ pub fn composite_frame(
 
     let buf = pooled.as_mut_slice();
 
-    // Skip the canvas clear when the first layer is opaque, unrotated, and
-    // covers the entire canvas — the blit will fully overwrite every pixel.
-    if !first_layer_covers_canvas(layers, canvas_w, canvas_h) {
+    // Two-pass source resolution.
+    //
+    // Pass 1: populate the conversion cache for every non-RGBA layer.
+    // `slot_idx` uses the position in the `layers` slice (which preserves
+    // `None` holes) so that cache indices stay stable even when some slots
+    // have no frame.
+    for (slot_idx, entry) in layers.iter().enumerate() {
+        if let Some(layer) = entry {
+            if layer.pixel_format != PixelFormat::Rgba8 {
+                conversion_cache.get_or_convert(slot_idx, layer);
+            }
+        }
+    }
+
+    // Pass 2: build resolved references.  The mutable borrow of
+    // `conversion_cache` from pass 1 is released, so we can now take
+    // shared references into the cache alongside references into `layers`.
+    let resolved: Vec<Option<(&LayerSnapshot, &[u8])>> = layers
+        .iter()
+        .enumerate()
+        .map(|(slot_idx, entry)| {
+            entry.as_ref().map(|layer| {
+                let src_data: &[u8] = match layer.pixel_format {
+                    PixelFormat::Rgba8 => layer.data.as_slice(),
+                    PixelFormat::I420 | PixelFormat::Nv12 => {
+                        // Cache was populated in pass 1; this is a shared
+                        // read that cannot fail.
+                        conversion_cache.get_cached(slot_idx, layer)
+                    },
+                };
+                (layer, src_data)
+            })
+        })
+        .collect();
+
+    // Now that we have the first layer's resolved RGBA data, check whether
+    // the canvas clear can be skipped.
+    let first_src = resolved.iter().flatten().next().map(|(_, d)| *d);
+    if !first_layer_covers_canvas(layers, canvas_w, canvas_h, first_src) {
         buf[..total_bytes].fill(0);
     }
 
     // Blit each layer (in order — first layer is bottom, last is top).
-    // Non-RGBA layers use the conversion cache to avoid redundant per-frame
-    // YUV→RGBA8 conversion when the source data hasn't changed.
-    for (slot_idx, layer) in layers.iter().flatten().enumerate() {
+    for (layer, src_data) in resolved.iter().flatten() {
         let dst_rect =
             layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h });
 
-        let src_data: &[u8] = match layer.pixel_format {
-            PixelFormat::Rgba8 => layer.data.as_slice(),
-            PixelFormat::I420 | PixelFormat::Nv12 => {
-                conversion_cache.get_or_convert(slot_idx, layer)
-            },
-        };
-
         scale_blit_rgba_rotated(
             buf,
             canvas_w,
diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs
index 51b20429..9c45af98 100644
--- a/crates/nodes/src/video/compositor/mod.rs
+++ b/crates/nodes/src/video/compositor/mod.rs
@@ -25,8 +25,8 @@
 //! - Bilinear / Lanczos scaling (MVP uses nearest-neighbor).
 
 pub mod config;
-mod kernel;
-mod overlay;
+pub mod kernel;
+pub mod overlay;
 pub mod pixel_ops;
 
 use async_trait::async_trait;

From 3b26f3269dd5d1684467ca34c4b4ee444970b5c1 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 09:16:04 +0000
Subject: [PATCH 11/12] style: apply rustfmt to fix formatting

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/core/src/frame_pool.rs               | 5 ++++-
 crates/nodes/src/video/compositor/kernel.rs | 5 ++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs
index 9e3bf98e..c7a6a829 100644
--- a/crates/core/src/frame_pool.rs
+++ b/crates/core/src/frame_pool.rs
@@ -327,7 +327,10 @@ pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16;
 
 impl FramePool<u8> {
     pub fn video_default() -> Self {
-        Self::with_buckets(DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET)
+        Self::with_buckets(
+            DEFAULT_VIDEO_BUCKET_SIZES.to_vec(),
+            DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET,
+        )
     }
 }
 
diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs
index 3e979fe6..424decdb 100644
--- a/crates/nodes/src/video/compositor/kernel.rs
+++ b/crates/nodes/src/video/compositor/kernel.rs
@@ -56,9 +56,8 @@ impl ConversionCache {
     /// layer has been converted.
     fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] {
         #[allow(clippy::expect_used)]
-        let cached = self.entries[slot_idx]
-            .as_ref()
-            .expect("get_cached called before get_or_convert");
+        let cached =
+            self.entries[slot_idx].as_ref().expect("get_cached called before get_or_convert");
         let needed = layer.width as usize * layer.height as usize * 4;
         &cached.rgba[..needed]
     }

From f71b7b563015b6b2aeab4be89a139335bc612d4a Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 1 Mar 2026 09:37:10 +0000
Subject: [PATCH 12/12] perf(compositor): SSE2 blend, alpha-scan cache, bench
 pool, lazy prealloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix 4 remaining performance findings:

1. High: Add SSE2 SIMD fast path for RGBA blend loops (blit_row_opaque,
   blit_row_alpha). Processes 4 pixels at a time with fast-paths for
   fully-opaque (direct copy) and fully-transparent (skip) source pixels.

2. Medium: Optimize alpha scan in clear-skip check — skip scan entirely
   for I420/NV12 layers (always alpha=255 after conversion), cache scan
   result by Arc pointer identity for RGBA layers.

3. Medium: Pass VideoFramePool to bench_composite instead of None, so
   benchmark exercises pool reuse like production.

4. Low-Medium: Lazy preallocate on first bucket use — when a bucket is
   first hit, allocate one extra buffer so the second get() is a hit.

Also: inline clear-skip logic to fix borrow checker conflict, remove
unused first_layer_covers_canvas function, add clippy suppression
rationale comments for needless_range_loop.

Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/core/src/frame_pool.rs                 |  11 +
 crates/engine/benches/compositor_only.rs      |   9 +-
 crates/nodes/src/video/compositor/kernel.rs   | 117 ++++----
 .../nodes/src/video/compositor/pixel_ops.rs   | 279 +++++++++++++++++-
 4 files changed, 360 insertions(+), 56 deletions(-)

diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs
index c7a6a829..9489df8e 100644
--- a/crates/core/src/frame_pool.rs
+++ b/crates/core/src/frame_pool.rs
@@ -139,6 +139,11 @@ impl<T: Clone + Default> FramePool<T> {
     /// Get pooled storage for at least `min_len` elements.
     ///
     /// If `min_len` doesn't fit in any bucket, returns a non-pooled buffer of exact size.
+    ///
+    /// On the first miss for a given bucket (cold start), an extra buffer is
+    /// allocated and placed into the pool so that the *next* `get()` at the
+    /// same size is a hit.  This amortises cold-start allocation cost without
+    /// pre-allocating every bucket size up front.
     pub fn get(&self, min_len: usize) -> PooledFrameData<T> {
         let (handle, bucket_idx, bucket_size, maybe_buf) = {
             let Ok(mut guard) = self.inner.lock() else {
@@ -154,6 +159,12 @@ impl<T: Clone + Default> FramePool<T> {
                 guard.hits += 1;
             } else {
                 guard.misses += 1;
+                // Lazy preallocate: on first miss for this bucket, seed the
+                // pool with one extra buffer so subsequent gets are hits.
+                if guard.buckets[bucket_idx].is_empty() && guard.buckets[bucket_idx].capacity() == 0
+                {
+                    guard.buckets[bucket_idx].push(vec![T::default(); bucket_size]);
+                }
             }
             (self.handle(), bucket_idx, bucket_size, buf)
         };
diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs
index f5036d7d..5a6263fe 100644
--- a/crates/engine/benches/compositor_only.rs
+++ b/crates/engine/benches/compositor_only.rs
@@ -38,6 +38,7 @@ use std::time::Instant;
 
 use streamkit_core::frame_pool::PooledVideoData;
 use streamkit_core::types::PixelFormat;
+use streamkit_core::VideoFramePool;
 
 // Re-use the compositor kernel and pixel_ops directly.
 use streamkit_nodes::video::compositor::config::Rect;
@@ -170,7 +171,10 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> {
 /// Call the real `composite_frame` kernel for `frame_count` iterations,
 /// returning per-frame timing statistics.  This exercises all kernel
 /// optimizations: conversion cache, skip-canvas-clear, identity-scale
-/// fast-path, precomputed x-map, etc.
+/// fast-path, precomputed x-map, SSE2 blend, etc.
+///
+/// Uses a real `VideoFramePool` to match production behaviour (pooled buffer
+/// reuse instead of per-frame heap allocation).
 fn bench_composite(
     _label: &str,
     canvas_w: u32,
@@ -180,6 +184,7 @@ fn bench_composite(
 ) -> BenchResult {
     let empty_overlays: Vec<Arc<DecodedOverlay>> = Vec::new();
     let mut conversion_cache = ConversionCache::new();
+    let pool = VideoFramePool::video_default();
 
     let start = Instant::now();
 
@@ -190,7 +195,7 @@ fn bench_composite(
             layers,
             &empty_overlays,
             &empty_overlays,
-            None,
+            Some(&pool),
             &mut conversion_cache,
         );
     }
diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs
index 424decdb..621fe893 100644
--- a/crates/nodes/src/video/compositor/kernel.rs
+++ b/crates/nodes/src/video/compositor/kernel.rs
@@ -37,13 +37,42 @@ struct CachedConversion {
 ///
 /// Avoids redundant per-frame I420/NV12 → RGBA8 conversion when the source
 /// `Arc<PooledVideoData>` hasn't changed since the previous frame.
+///
+/// Also caches the first-layer alpha-scan result so that the canvas-clear
+/// skip check doesn't re-scan every frame when the source hasn't changed.
 pub struct ConversionCache {
     entries: Vec<Option<CachedConversion>>,
+    /// Cached result of the alpha-opaqueness scan for the first visible layer.
+    /// `(data_identity, all_opaque)` — valid when the `Arc` pointer matches.
+    first_layer_alpha_cache: Option<(usize, bool)>,
 }
 
 impl ConversionCache {
     pub const fn new() -> Self {
-        Self { entries: Vec::new() }
+        Self { entries: Vec::new(), first_layer_alpha_cache: None }
+    }
+
+    /// Check whether the first visible layer's source data is fully opaque.
+    ///
+    /// For I420/NV12 layers, the converted RGBA always has alpha == 255, so
+    /// we return `true` immediately without scanning.  For RGBA layers we
+    /// scan once and cache the result keyed by `Arc::as_ptr`.
+    fn first_layer_all_opaque(&mut self, layer: &LayerSnapshot, rgba_data: &[u8]) -> bool {
+        // I420/NV12 → RGBA conversion always writes alpha = 255.
+        if layer.pixel_format != PixelFormat::Rgba8 {
+            return true;
+        }
+
+        let identity = Arc::as_ptr(&layer.data) as usize;
+        if let Some((cached_id, cached_result)) = self.first_layer_alpha_cache {
+            if cached_id == identity {
+                return cached_result;
+            }
+        }
+
+        let all_opaque = rgba_data.chunks_exact(4).all(|px| px[3] == 255);
+        self.first_layer_alpha_cache = Some((identity, all_opaque));
+        all_opaque
     }
 
     /// Return a previously-cached RGBA slice for `slot_idx`.
@@ -117,50 +146,6 @@ impl ConversionCache {
     }
 }
 
-/// Returns `true` if the first visible layer is fully opaque (both layer
-/// opacity *and* source pixel alpha), unrotated, and covers the entire
-/// canvas — meaning the canvas clear can be skipped.
-///
-/// `first_src_data` is the RGBA8 source buffer for the first layer (after
-/// any YUV→RGBA conversion).  We check that every pixel in the region that
-/// will be blitted has `alpha == 255`; if any pixel is semi-transparent the
-/// clear cannot be skipped because the blit would blend with uninitialised
-/// (or stale pooled) canvas bytes.
-fn first_layer_covers_canvas(
-    layers: &[Option<LayerSnapshot>],
-    canvas_w: u32,
-    canvas_h: u32,
-    first_src_data: Option<&[u8]>,
-) -> bool {
-    let Some(first) = layers.iter().flatten().next() else {
-        return false;
-    };
-
-    if first.opacity < 1.0 || first.rotation_degrees.abs() >= 0.01 {
-        return false;
-    }
-
-    // Check if the layer fully covers the canvas.
-    // A layer with no rect fills the entire canvas by default.
-    let covers = first.rect.as_ref().map_or(true, |r| {
-        r.x <= 0
-            && r.y <= 0
-            && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
-            && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
-    });
-    if !covers {
-        return false;
-    }
-
-    // Verify that *all* source pixels are fully opaque (alpha == 255).
-    // Without this check, semi-transparent source pixels would blend with
-    // uninitialised canvas bytes when the clear is skipped.
-    let Some(src) = first_src_data else {
-        return false;
-    };
-    src.chunks_exact(4).all(|px| px[3] == 255)
-}
-
 /// Snapshot of one input layer's data for the blocking compositor thread.
 pub struct LayerSnapshot {
     pub data: Arc<streamkit_core::frame_pool::PooledVideoData>,
@@ -234,6 +219,41 @@ pub fn composite_frame(
         }
     }
 
+    // Between pass 1 and pass 2: check whether the first layer allows
+    // skipping the canvas clear.  We do the alpha-opaqueness check here
+    // while `conversion_cache` is still mutably available.  The result
+    // is a simple bool so no borrows leak into pass 2.
+    let skip_clear = layers
+        .iter()
+        .enumerate()
+        .find_map(|(i, e)| e.as_ref().map(|l| (i, l)))
+        .map_or(false, |(_slot_idx, layer)| {
+            // Quick checks that don't need the pixel data.
+            if layer.opacity < 1.0 || layer.rotation_degrees.abs() >= 0.01 {
+                return false;
+            }
+            let covers = layer.rect.as_ref().map_or(true, |r| {
+                r.x <= 0
+                    && r.y <= 0
+                    && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w)
+                    && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h)
+            });
+            if !covers {
+                return false;
+            }
+            // Alpha check — needs mutable access to conversion_cache.
+            match layer.pixel_format {
+                // I420/NV12 → RGBA conversion always writes alpha = 255.
+                PixelFormat::I420 | PixelFormat::Nv12 => true,
+                PixelFormat::Rgba8 => {
+                    conversion_cache.first_layer_all_opaque(layer, layer.data.as_slice())
+                },
+            }
+        });
+    if !skip_clear {
+        buf[..total_bytes].fill(0);
+    }
+
     // Pass 2: build resolved references.  The mutable borrow of
     // `conversion_cache` from pass 1 is released, so we can now take
     // shared references into the cache alongside references into `layers`.
@@ -255,13 +275,6 @@ pub fn composite_frame(
         })
         .collect();
 
-    // Now that we have the first layer's resolved RGBA data, check whether
-    // the canvas clear can be skipped.
-    let first_src = resolved.iter().flatten().next().map(|(_, d)| *d);
-    if !first_layer_covers_canvas(layers, canvas_w, canvas_h, first_src) {
-        buf[..total_bytes].fill(0);
-    }
-
     // Blit each layer (in order — first layer is bottom, last is top).
     for (layer, src_data) in resolved.iter().flatten() {
         let dst_rect =
diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs
index a87f13fd..3c6dd03e 100644
--- a/crates/nodes/src/video/compositor/pixel_ops.rs
+++ b/crates/nodes/src/video/compositor/pixel_ops.rs
@@ -224,17 +224,183 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 {
     ((val + (val >> 8)) >> 8) as u8
 }
 
+// ── SSE2 alpha-blend helpers (x86-64) ──────────────────────────────────────
+//
+// Process 4 RGBA pixels at a time using SSE2 integer arithmetic.
+// Source pixels are gathered (non-contiguous via x_map), destination pixels
+// are contiguous.  The blend formula is identical to the scalar `blend_u8`:
+//   result = ((src*alpha + dst*(255-alpha) + 128) + ((…) >> 8)) >> 8
+//
+// For the alpha channel we set source-alpha to 255 before blending so that
+// `blend_u8(255, dst_alpha, src_alpha)` naturally computes the standard
+// over-composite alpha `a_src + a_dst*(1-a_src)` (within ±1 of the scalar
+// approximation — both are approximate divisions by 255).
+
+/// Read 4 bytes from `src` at `offset` as a native-endian `u32`.
+///
+/// # Safety
+///
+/// Caller must ensure `offset + 3 < src.len()`.
+#[inline(always)]
+unsafe fn read_rgba_u32(src: &[u8], offset: usize) -> u32 {
+    std::ptr::read_unaligned(src.as_ptr().add(offset) as *const u32)
+}
+
+/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels
+/// using SSE2 "over" compositing (no opacity modifier).
+///
+/// # Safety
+///
+/// `dst_ptr` must point to at least 16 writable bytes.  Source pixel values
+/// in `src_pixels` must be valid RGBA `u32` values.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+unsafe fn blend_4px_over_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4]) {
+    use std::arch::x86_64::*;
+
+    let zero = _mm_setzero_si128();
+    let c255 = _mm_set1_epi16(255);
+    let c128 = _mm_set1_epi16(128);
+
+    // Assemble 4 gathered source pixels into one register.
+    let src4 = _mm_set_epi32(
+        src_pixels[3] as i32,
+        src_pixels[2] as i32,
+        src_pixels[1] as i32,
+        src_pixels[0] as i32,
+    );
+
+    // Mask with 0xFF at each pixel's alpha-byte position (bytes 3,7,11,15).
+    let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32);
+
+    // Fast path: all 4 source pixels fully opaque → direct copy.
+    let alpha_bytes = _mm_and_si128(src4, alpha_byte_mask);
+    if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, alpha_byte_mask)) == 0xFFFF {
+        _mm_storeu_si128(dst_ptr as *mut __m128i, src4);
+        return;
+    }
+
+    // Fast path: all 4 source pixels fully transparent → nothing to do.
+    if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, zero)) == 0xFFFF {
+        return;
+    }
+
+    let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i);
+
+    // Replace source alpha channel with 255 for correct composite-alpha
+    // via blend_u8(255, dst_alpha, src_alpha).
+    let src_blend = _mm_or_si128(src4, alpha_byte_mask);
+
+    // --- Low 2 pixels (u16 arithmetic) ---
+    let src_lo = _mm_unpacklo_epi8(src_blend, zero);
+    let dst_lo = _mm_unpacklo_epi8(dst4, zero);
+
+    // Extract original source alpha and broadcast within each 4-u16 pixel group.
+    let src_orig_lo = _mm_unpacklo_epi8(src4, zero);
+    // _MM_SHUFFLE(3,3,3,3) = 0xFF → replicate element 3 (alpha) to all 4 positions.
+    let alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF);
+
+    let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo);
+    let val_lo = _mm_add_epi16(
+        _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)),
+        c128,
+    );
+    let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8);
+
+    // --- High 2 pixels ---
+    let src_hi = _mm_unpackhi_epi8(src_blend, zero);
+    let dst_hi = _mm_unpackhi_epi8(dst4, zero);
+    let src_orig_hi = _mm_unpackhi_epi8(src4, zero);
+    let alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF);
+
+    let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi);
+    let val_hi = _mm_add_epi16(
+        _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)),
+        c128,
+    );
+    let result_hi = _mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8);
+
+    // Pack back to u8 and store.
+    _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi));
+}
+
+/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels
+/// using SSE2 "over" compositing **with** an opacity multiplier applied to
+/// each pixel's source alpha.
+///
+/// # Safety
+///
+/// `dst_ptr` must point to at least 16 writable bytes.
+#[cfg(target_arch = "x86_64")]
+#[inline(always)]
+unsafe fn blend_4px_over_alpha_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4], opacity: u16) {
+    use std::arch::x86_64::*;
+
+    let zero = _mm_setzero_si128();
+    let c255 = _mm_set1_epi16(255);
+    let c128 = _mm_set1_epi16(128);
+    let opacity_v = _mm_set1_epi16(opacity as i16);
+
+    let src4 = _mm_set_epi32(
+        src_pixels[3] as i32,
+        src_pixels[2] as i32,
+        src_pixels[1] as i32,
+        src_pixels[0] as i32,
+    );
+
+    let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i);
+    let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32);
+    let src_blend = _mm_or_si128(src4, alpha_byte_mask);
+
+    // --- Low 2 pixels ---
+    let src_lo = _mm_unpacklo_epi8(src_blend, zero);
+    let dst_lo = _mm_unpacklo_epi8(dst4, zero);
+
+    // Extract original alpha, apply opacity: sa_eff = (sa * opacity + 128) >> 8.
+    // Max value: (255*255+128)>>8 = 254, so no clamping needed.
+    let src_orig_lo = _mm_unpacklo_epi8(src4, zero);
+    let raw_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF);
+    let alpha_lo = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_lo, opacity_v), c128), 8);
+
+    let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo);
+    let val_lo = _mm_add_epi16(
+        _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)),
+        c128,
+    );
+    let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8);
+
+    // --- High 2 pixels ---
+    let src_hi = _mm_unpackhi_epi8(src_blend, zero);
+    let dst_hi = _mm_unpackhi_epi8(dst4, zero);
+    let src_orig_hi = _mm_unpackhi_epi8(src4, zero);
+    let raw_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF);
+    let alpha_hi = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_hi, opacity_v), c128), 8);
+
+    let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi);
+    let val_hi = _mm_add_epi16(
+        _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)),
+        c128,
+    );
+    let result_hi = _mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8);
+
+    _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi));
+}
+
 /// Inner blit for fully-opaque layers (`opacity >= 1.0`).  Skips the
 /// per-pixel f32 multiply on the source alpha channel.
 ///
 /// Uses integer-only alpha blending for semi-transparent source pixels.
 /// `x_map` provides precomputed source-X indices (one per destination column).
+///
+/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is
+/// wide enough and bounds can be pre-validated.
 #[allow(
     clippy::cast_possible_truncation,
     clippy::cast_sign_loss,
     clippy::too_many_arguments,
     clippy::suboptimal_flops,
     clippy::inline_always,
+    // dx is used as both x_map index and dst offset, so an iterator is non-trivial.
     clippy::needless_range_loop
 )]
 #[inline(always)]
@@ -248,6 +414,59 @@ fn blit_row_opaque(
     x_map: &[usize],
 ) {
     let src_row_base = sy * sw * 4;
+
+    // ── SSE2 fast path: process 4 pixels at a time ─────────────────────
+    #[cfg(target_arch = "x86_64")]
+    {
+        // Pre-validate bounds so the inner SIMD loop is branch-free.
+        let src_row_end = src_row_base + sw * 4;
+        let dst_end = (rx + effective_rw) * 4;
+        if src_row_end <= src.len() && dst_end <= row_slice.len() {
+            let chunks = effective_rw / 4;
+            for c in 0..chunks {
+                let dx = c * 4;
+                // SAFETY: bounds pre-validated above; x_map values < sw;
+                // dst range (rx+dx)*4..(rx+dx+4)*4 < dst_end <= row_slice.len().
+                unsafe {
+                    let pixels = [
+                        read_rgba_u32(src, src_row_base + x_map[dx] * 4),
+                        read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4),
+                        read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4),
+                        read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4),
+                    ];
+                    blend_4px_over_sse2(row_slice.as_mut_ptr().add((rx + dx) * 4), pixels);
+                }
+            }
+
+            // Scalar tail for remaining 0-3 pixels.
+            let tail_start = chunks * 4;
+            for dx in tail_start..effective_rw {
+                let sx = x_map[dx];
+                let src_idx = src_row_base + sx * 4;
+                let sr = src[src_idx];
+                let sg = src[src_idx + 1];
+                let sb = src[src_idx + 2];
+                let sa = src[src_idx + 3];
+                let dst_idx = (rx + dx) * 4;
+                if sa == 255 {
+                    row_slice[dst_idx] = sr;
+                    row_slice[dst_idx + 1] = sg;
+                    row_slice[dst_idx + 2] = sb;
+                    row_slice[dst_idx + 3] = 255;
+                } else if sa > 0 {
+                    let a16 = u16::from(sa);
+                    row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16);
+                    row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16);
+                    row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16);
+                    let da = u16::from(row_slice[dst_idx + 3]);
+                    row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8;
+                }
+            }
+            return;
+        }
+    }
+
+    // ── Scalar fallback (bounds-checked per pixel) ─────────────────────
     for dx in 0..effective_rw {
         let sx = x_map[dx];
         let src_idx = src_row_base + sx * 4;
@@ -275,7 +494,6 @@ fn blit_row_opaque(
             row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16);
             row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16);
             row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16);
-            // Composite alpha: a_out = a_src + a_dst * (1 - a_src)
             let da = u16::from(row_slice[dst_idx + 3]);
             row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8;
         }
@@ -287,12 +505,16 @@ fn blit_row_opaque(
 ///
 /// Uses integer-only alpha blending.
 /// `x_map` provides precomputed source-X indices (one per destination column).
+///
+/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is
+/// wide enough and bounds can be pre-validated.
 #[allow(
     clippy::cast_possible_truncation,
     clippy::cast_sign_loss,
     clippy::too_many_arguments,
     clippy::suboptimal_flops,
     clippy::inline_always,
+    // dx is used as both x_map index and dst offset, so an iterator is non-trivial.
     clippy::needless_range_loop
 )]
 #[inline(always)]
@@ -310,6 +532,60 @@ fn blit_row_alpha(
     let opacity_u16 = (opacity * 255.0 + 0.5) as u16;
     let src_row_base = sy * sw * 4;
 
+    // ── SSE2 fast path ─────────────────────────────────────────────────
+    #[cfg(target_arch = "x86_64")]
+    {
+        let src_row_end = src_row_base + sw * 4;
+        let dst_end = (rx + effective_rw) * 4;
+        if src_row_end <= src.len() && dst_end <= row_slice.len() {
+            let chunks = effective_rw / 4;
+            for c in 0..chunks {
+                let dx = c * 4;
+                unsafe {
+                    let pixels = [
+                        read_rgba_u32(src, src_row_base + x_map[dx] * 4),
+                        read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4),
+                        read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4),
+                        read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4),
+                    ];
+                    blend_4px_over_alpha_sse2(
+                        row_slice.as_mut_ptr().add((rx + dx) * 4),
+                        pixels,
+                        opacity_u16,
+                    );
+                }
+            }
+
+            // Scalar tail.
+            let tail_start = chunks * 4;
+            for dx in tail_start..effective_rw {
+                let sx = x_map[dx];
+                let src_idx = src_row_base + sx * 4;
+                let sr = src[src_idx];
+                let sg = src[src_idx + 1];
+                let sb = src[src_idx + 2];
+                let sa = src[src_idx + 3];
+                let dst_idx = (rx + dx) * 4;
+                let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255);
+                if sa_eff == 255 {
+                    row_slice[dst_idx] = sr;
+                    row_slice[dst_idx + 1] = sg;
+                    row_slice[dst_idx + 2] = sb;
+                    row_slice[dst_idx + 3] = 255;
+                } else if sa_eff > 0 {
+                    row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], sa_eff);
+                    row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], sa_eff);
+                    row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], sa_eff);
+                    let da = u16::from(row_slice[dst_idx + 3]);
+                    row_slice[dst_idx + 3] =
+                        (sa_eff + ((da * (255 - sa_eff) + 128) >> 8)).min(255) as u8;
+                }
+            }
+            return;
+        }
+    }
+
+    // ── Scalar fallback ────────────────────────────────────────────────
     for dx in 0..effective_rw {
         let sx = x_map[dx];
         let src_idx = src_row_base + sx * 4;
@@ -327,7 +603,6 @@ fn blit_row_alpha(
             continue;
         }
 
-        // Effective alpha: (sa * opacity) / 255, done in integer.
         let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255);
         if sa_eff == 255 {
             row_slice[dst_idx] = sr;