From 250e6e8971102a839d0d4bd91e77bf6ca90fa8ed Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:30:51 +0000 Subject: [PATCH 01/12] perf(compositor): add compositor-only microbenchmark Adds a standalone benchmark that measures composite_frame() in isolation (no VP9 encode, no mux, no async runtime overhead). Scenarios: - 1/2/4 layers RGBA - Mixed I420+RGBA and NV12+RGBA (measures conversion overhead) - Rotation (measures rotated blit path) - Static layers (same Arc each frame, for future cache-hit measurement) Runs at 640x480, 1280x720, 1920x1080 by default. Baseline results on this VM (8 logical CPUs): 1920x1080 1-layer-rgba: ~728 fps (1.37 ms/frame) 1920x1080 2-layer-rgba-pip: ~601 fps (1.66 ms/frame) 1920x1080 2-layer-i420+rgba: ~427 fps (2.34 ms/frame) 1920x1080 2-layer-nv12+rgba: ~478 fps (2.09 ms/frame) 1920x1080 2-layer-rgba-rotated: ~470 fps (2.13 ms/frame) Co-Authored-By: Claudio Costa --- crates/engine/Cargo.toml | 4 + crates/engine/benches/compositor_only.rs | 649 +++++++++++++++++++++++ 2 files changed, 653 insertions(+) create mode 100644 crates/engine/benches/compositor_only.rs diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml index a5cf11e4..8410f58d 100644 --- a/crates/engine/Cargo.toml +++ b/crates/engine/Cargo.toml @@ -64,5 +64,9 @@ indexmap = { workspace = true } name = "compositor_pipeline" harness = false +[[bench]] +name = "compositor_only" +harness = false + [lints] workspace = true diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs new file mode 100644 index 00000000..0b66c961 --- /dev/null +++ b/crates/engine/benches/compositor_only.rs @@ -0,0 +1,649 @@ +// SPDX-FileCopyrightText: © 2025 StreamKit Contributors +// +// SPDX-License-Identifier: MPL-2.0 + +#![allow(clippy::disallowed_macros)] // Bench binary intentionally uses eprintln!/println! for output. +#![allow(clippy::expect_used)] // Panicking on errors is fine in a benchmark binary. 
+#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::cast_precision_loss)] + +//! Compositor-only microbenchmark — measures `composite_frame` in isolation +//! (no VP9 encode, no mux, no async runtime overhead). +//! +//! Exercises the following scenarios across multiple resolutions: +//! +//! - 1 layer RGBA (baseline) +//! - 2 layers RGBA (PiP) +//! - 4 layers RGBA +//! - 2 layers mixed I420 + RGBA (measures YUV→RGBA conversion overhead) +//! - 2 layers mixed NV12 + RGBA +//! - 2 layers RGBA with rotation +//! - 2 layers RGBA, static (same data each frame — for future cache-hit measurement) +//! +//! ## Usage +//! +//! Quick run (default: 200 frames × 3 iterations at each of 640×480, 1280×720 and 1920×1080): +//! +//! ```bash +//! cargo bench -p streamkit-engine --bench compositor_only +//! ``` +//! +//! Custom parameters: +//! +//! ```bash +//! cargo bench -p streamkit-engine --bench compositor_only -- --frames 500 --width 1920 --height 1080 +//! ``` + +use std::sync::Arc; +use std::time::Instant; + +use streamkit_core::frame_pool::PooledVideoData; +use streamkit_core::types::PixelFormat; + +// Re-use the compositor's `Rect` config type and public pixel_ops directly. +use streamkit_nodes::video::compositor::config::Rect; +use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420; + +/// Inline copy of `LayerSnapshot` to avoid depending on the private `kernel` module. +/// Must stay in sync with `kernel::LayerSnapshot`. 
+struct LayerSnapshot { + data: Arc, + width: u32, + height: u32, + pixel_format: PixelFormat, + rect: Option, + opacity: f32, + z_index: i32, + rotation_degrees: f32, +} + +// ── Default benchmark parameters ──────────────────────────────────────────── + +const DEFAULT_WIDTH: u32 = 1280; +const DEFAULT_HEIGHT: u32 = 720; +const DEFAULT_FRAME_COUNT: u32 = 200; + +// ── Arg parser ────────────────────────────────────────────────────────────── + +struct BenchArgs { + width: u32, + height: u32, + frame_count: u32, + iterations: u32, + /// Optional filter: only run scenarios whose label contains this substring. + filter: Option, +} + +impl BenchArgs { + fn parse() -> Self { + let args: Vec = std::env::args().collect(); + let mut cfg = Self { + width: DEFAULT_WIDTH, + height: DEFAULT_HEIGHT, + frame_count: DEFAULT_FRAME_COUNT, + iterations: 3, + filter: None, + }; + let mut i = 1; + while i < args.len() { + match args[i].as_str() { + "--width" | "-w" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.width = v.parse().unwrap_or(cfg.width); + } + }, + "--height" | "-h" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.height = v.parse().unwrap_or(cfg.height); + } + }, + "--frames" | "-n" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.frame_count = v.parse().unwrap_or(cfg.frame_count); + } + }, + "--iterations" | "-i" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.iterations = v.parse().unwrap_or(cfg.iterations); + } + }, + "--filter" | "-f" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.filter = Some(v.clone()); + } + }, + _ => {}, + } + i += 1; + } + cfg + } +} + +// ── Frame generators ──────────────────────────────────────────────────────── + +/// Generate an RGBA8 color-bar frame (opaque, all alpha = 255). +fn generate_rgba_frame(width: u32, height: u32) -> Vec { + let w = width as usize; + let h = height as usize; + let mut data = vec![0u8; w * h * 4]; + // Simple vertical gradient bars for visual distinctness. 
+ let bar_colors: &[(u8, u8, u8)] = &[ + (191, 191, 191), // white + (191, 191, 0), // yellow + (0, 191, 191), // cyan + (0, 191, 0), // green + (191, 0, 191), // magenta + (191, 0, 0), // red + (0, 0, 191), // blue + ]; + for row in 0..h { + for col in 0..w { + let bar_idx = col * bar_colors.len() / w; + let (r, g, b) = bar_colors[bar_idx]; + let off = (row * w + col) * 4; + data[off] = r; + data[off + 1] = g; + data[off + 2] = b; + data[off + 3] = 255; + } + } + data +} + +/// Generate an I420 frame by converting an RGBA frame. +fn generate_i420_frame(width: u32, height: u32) -> Vec { + let rgba = generate_rgba_frame(width, height); + rgba8_to_i420(&rgba, width, height) +} + +/// Generate an NV12 frame by converting an RGBA frame. +fn generate_nv12_frame(width: u32, height: u32) -> Vec { + let rgba = generate_rgba_frame(width, height); + let w = width as usize; + let h = height as usize; + let chroma_w = w.div_ceil(2); + let chroma_h = h.div_ceil(2); + let nv12_size = w * h + chroma_w * 2 * chroma_h; + let mut nv12 = vec![0u8; nv12_size]; + streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf(&rgba, width, height, &mut nv12); + nv12 +} + +// ── Compositing harness ───────────────────────────────────────────────────── + +/// Directly call the compositing kernel for `frame_count` iterations, +/// returning per-frame timing statistics. +fn bench_composite( + _label: &str, + canvas_w: u32, + canvas_h: u32, + layers: &[Option], + frame_count: u32, +) -> BenchResult { + // Re-create the kernel's compositing logic inline since `composite_frame` + // is pub(crate). We call the public pixel_ops functions directly. + let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4; + let mut canvas = vec![0u8; total_bytes]; + let mut i420_scratch: Vec = Vec::new(); + + let start = Instant::now(); + + for _ in 0..frame_count { + // Zero the canvas. + canvas.fill(0); + + // Blit each layer. 
+ for layer in layers.iter().flatten() { + let dst_rect = layer.rect.clone().unwrap_or(Rect { + x: 0, + y: 0, + width: canvas_w, + height: canvas_h, + }); + + let src_data: &[u8] = match layer.pixel_format { + PixelFormat::Rgba8 => layer.data.as_slice(), + PixelFormat::I420 => { + let needed = layer.width as usize * layer.height as usize * 4; + if i420_scratch.len() < needed { + i420_scratch.resize(needed, 0); + } + streamkit_nodes::video::compositor::pixel_ops::i420_to_rgba8_buf( + layer.data.as_slice(), + layer.width, + layer.height, + &mut i420_scratch, + ); + &i420_scratch[..needed] + }, + PixelFormat::Nv12 => { + let needed = layer.width as usize * layer.height as usize * 4; + if i420_scratch.len() < needed { + i420_scratch.resize(needed, 0); + } + streamkit_nodes::video::compositor::pixel_ops::nv12_to_rgba8_buf( + layer.data.as_slice(), + layer.width, + layer.height, + &mut i420_scratch, + ); + &i420_scratch[..needed] + }, + }; + + streamkit_nodes::video::compositor::pixel_ops::scale_blit_rgba_rotated( + &mut canvas, + canvas_w, + canvas_h, + src_data, + layer.width, + layer.height, + &dst_rect, + layer.opacity, + layer.rotation_degrees, + ); + } + } + + let elapsed = start.elapsed(); + BenchResult { + total_secs: elapsed.as_secs_f64(), + frame_count, + } +} + +struct BenchResult { + total_secs: f64, + frame_count: u32, +} + +impl BenchResult { + fn fps(&self) -> f64 { + f64::from(self.frame_count) / self.total_secs + } + + fn ms_per_frame(&self) -> f64 { + self.total_secs * 1000.0 / f64::from(self.frame_count) + } +} + +// ── Scenario definitions ──────────────────────────────────────────────────── + +struct Scenario { + label: String, + layers: Vec>, +} + +fn make_layer( + data: Vec, + width: u32, + height: u32, + pixel_format: PixelFormat, + rect: Option, + opacity: f32, + z_index: i32, + rotation_degrees: f32, +) -> Option { + Some(LayerSnapshot { + data: Arc::new(PooledVideoData::from_vec(data)), + width, + height, + pixel_format, + rect, + opacity, + 
z_index, + rotation_degrees, + }) +} + +fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { + let pip_w = canvas_w / 3; + let pip_h = canvas_h / 3; + let pip_x = (canvas_w - pip_w - 20) as i32; + let pip_y = (canvas_h - pip_h - 20) as i32; + + vec![ + // 1 layer RGBA — baseline + Scenario { + label: "1-layer-rgba".to_string(), + layers: vec![make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + )], + }, + // 2 layers RGBA (PiP) + Scenario { + label: "2-layer-rgba-pip".to_string(), + layers: vec![ + make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { + x: pip_x, + y: pip_y, + width: pip_w, + height: pip_h, + }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 4 layers RGBA + Scenario { + label: "4-layer-rgba".to_string(), + layers: vec![ + make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { + x: pip_x, + y: pip_y, + width: pip_w, + height: pip_h, + }), + 0.9, + 1, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { + x: 20, + y: 20, + width: pip_w, + height: pip_h, + }), + 0.8, + 2, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { + x: 20, + y: pip_y, + width: pip_w, + height: pip_h, + }), + 0.7, + 3, + 0.0, + ), + ], + }, + // 2 layers: I420 bg + RGBA PiP (measures conversion overhead) + Scenario { + label: "2-layer-i420+rgba".to_string(), + layers: vec![ + make_layer( + generate_i420_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::I420, + None, + 1.0, + 0, + 0.0, + ), + 
make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { + x: pip_x, + y: pip_y, + width: pip_w, + height: pip_h, + }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 2 layers: NV12 bg + RGBA PiP + Scenario { + label: "2-layer-nv12+rgba".to_string(), + layers: vec![ + make_layer( + generate_nv12_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Nv12, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { + x: pip_x, + y: pip_y, + width: pip_w, + height: pip_h, + }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 2 layers RGBA with rotation on PiP + Scenario { + label: "2-layer-rgba-rotated".to_string(), + layers: vec![ + make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { + x: pip_x, + y: pip_y, + width: pip_w, + height: pip_h, + }), + 0.9, + 1, + 15.0, // 15° rotation + ), + ], + }, + // 2 layers RGBA, static (same Arc — for future cache-hit measurement) + Scenario { + label: "2-layer-rgba-static".to_string(), + layers: { + let bg = Arc::new(PooledVideoData::from_vec(generate_rgba_frame( + canvas_w, canvas_h, + ))); + let pip = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(pip_w, pip_h))); + vec![ + Some(LayerSnapshot { + data: bg, + width: canvas_w, + height: canvas_h, + pixel_format: PixelFormat::Rgba8, + rect: None, + opacity: 1.0, + z_index: 0, + rotation_degrees: 0.0, + }), + Some(LayerSnapshot { + data: pip, + width: pip_w, + height: pip_h, + pixel_format: PixelFormat::Rgba8, + rect: Some(Rect { + x: pip_x, + y: pip_y, + width: pip_w, + height: pip_h, + }), + opacity: 0.9, + z_index: 1, + rotation_degrees: 0.0, + }), + ] + }, + }, + ] +} + +// ── Main ──────────────────────────────────────────────────────────────────── + +fn main() { + let 
args = BenchArgs::parse(); + + let resolutions: &[(u32, u32)] = if args.width == DEFAULT_WIDTH && args.height == DEFAULT_HEIGHT + { + // Default: run at multiple resolutions. + &[(640, 480), (1280, 720), (1920, 1080)] + } else { + // Custom: run at the specified resolution only. + // (Leak to get 'static — acceptable in a short-lived bench binary.) + let res = Box::leak(Box::new([(args.width, args.height)])); + res + }; + + eprintln!("╔══════════════════════════════════════════════════════════╗"); + eprintln!("║ Compositor-Only Microbenchmark ║"); + eprintln!("╠══════════════════════════════════════════════════════════╣"); + eprintln!( + "║ Resolutions : {:<41}║", + resolutions + .iter() + .map(|(w, h)| format!("{w}×{h}")) + .collect::>() + .join(", ") + ); + eprintln!("║ Frames : {:<41}║", args.frame_count); + eprintln!("║ Iterations : {:<41}║", args.iterations); + if let Some(ref f) = args.filter { + eprintln!("║ Filter : {f:<41}║"); + } + eprintln!("╚══════════════════════════════════════════════════════════╝"); + eprintln!(); + + let mut json_results: Vec = Vec::new(); + + for &(w, h) in resolutions { + eprintln!("── {w}×{h} ──────────────────────────────────────────────"); + + let scenarios = build_scenarios(w, h); + + for scenario in &scenarios { + if let Some(ref filter) = args.filter { + if !scenario.label.contains(filter.as_str()) { + continue; + } + } + + let mut iter_results = Vec::with_capacity(args.iterations as usize); + + for iter in 1..=args.iterations { + let result = + bench_composite(&scenario.label, w, h, &scenario.layers, args.frame_count); + eprintln!( + " {:<28} iter {iter}/{}: {:>8.1} fps ({:.2} ms/frame)", + scenario.label, + args.iterations, + result.fps(), + result.ms_per_frame(), + ); + iter_results.push(result); + } + + // Summary for this scenario. 
+ let fps_values: Vec = iter_results.iter().map(BenchResult::fps).collect(); + let ms_values: Vec = iter_results.iter().map(BenchResult::ms_per_frame).collect(); + let mean_fps = fps_values.iter().sum::() / fps_values.len() as f64; + let mean_ms = ms_values.iter().sum::() / ms_values.len() as f64; + let min_ms = ms_values.iter().copied().fold(f64::INFINITY, f64::min); + let max_ms = ms_values.iter().copied().fold(f64::NEG_INFINITY, f64::max); + + eprintln!( + " {:<28} avg: {:>8.1} fps ({:.2} ms/frame, min={:.2}, max={:.2})", + "", + mean_fps, + mean_ms, + min_ms, + max_ms, + ); + + json_results.push(serde_json::json!({ + "benchmark": "compositor_only", + "scenario": scenario.label, + "width": w, + "height": h, + "frame_count": args.frame_count, + "iterations": args.iterations, + "mean_fps": mean_fps, + "mean_ms_per_frame": mean_ms, + "min_ms_per_frame": min_ms, + "max_ms_per_frame": max_ms, + })); + } + eprintln!(); + } + + // Machine-readable JSON output. + println!("{}", serde_json::to_string_pretty(&json_results).expect("JSON serialization")); +} From 3a7a2b20c25fdd214de32efdd9de302e2a616d18 Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:31:02 +0000 Subject: [PATCH 02/12] style: apply rustfmt to compositor_only benchmark Co-Authored-By: Claudio Costa --- crates/engine/benches/compositor_only.rs | 82 +++++------------------- 1 file changed, 16 insertions(+), 66 deletions(-) diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs index 0b66c961..31ea5b85 100644 --- a/crates/engine/benches/compositor_only.rs +++ b/crates/engine/benches/compositor_only.rs @@ -170,7 +170,9 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec { let chroma_h = h.div_ceil(2); let nv12_size = w * h + chroma_w * 2 * chroma_h; let mut nv12 = vec![0u8; nv12_size]; - streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf(&rgba, width, height, &mut nv12); + 
streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf( + &rgba, width, height, &mut nv12, + ); nv12 } @@ -251,10 +253,7 @@ fn bench_composite( } let elapsed = start.elapsed(); - BenchResult { - total_secs: elapsed.as_secs_f64(), - frame_count, - } + BenchResult { total_secs: elapsed.as_secs_f64(), frame_count } } struct BenchResult { @@ -341,12 +340,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { pip_w, pip_h, PixelFormat::Rgba8, - Some(Rect { - x: pip_x, - y: pip_y, - width: pip_w, - height: pip_h, - }), + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), 0.9, 1, 0.0, @@ -372,12 +366,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { pip_w, pip_h, PixelFormat::Rgba8, - Some(Rect { - x: pip_x, - y: pip_y, - width: pip_w, - height: pip_h, - }), + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), 0.9, 1, 0.0, @@ -387,12 +376,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { pip_w, pip_h, PixelFormat::Rgba8, - Some(Rect { - x: 20, - y: 20, - width: pip_w, - height: pip_h, - }), + Some(Rect { x: 20, y: 20, width: pip_w, height: pip_h }), 0.8, 2, 0.0, @@ -402,12 +386,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { pip_w, pip_h, PixelFormat::Rgba8, - Some(Rect { - x: 20, - y: pip_y, - width: pip_w, - height: pip_h, - }), + Some(Rect { x: 20, y: pip_y, width: pip_w, height: pip_h }), 0.7, 3, 0.0, @@ -433,12 +412,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { pip_w, pip_h, PixelFormat::Rgba8, - Some(Rect { - x: pip_x, - y: pip_y, - width: pip_w, - height: pip_h, - }), + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), 0.9, 1, 0.0, @@ -464,12 +438,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { pip_w, pip_h, PixelFormat::Rgba8, - Some(Rect { - x: pip_x, - y: pip_y, - width: pip_w, - height: pip_h, - }), + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), 0.9, 1, 0.0, @@ -495,12 +464,7 @@ fn build_scenarios(canvas_w: u32, 
canvas_h: u32) -> Vec { pip_w, pip_h, PixelFormat::Rgba8, - Some(Rect { - x: pip_x, - y: pip_y, - width: pip_w, - height: pip_h, - }), + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), 0.9, 1, 15.0, // 15° rotation @@ -511,9 +475,8 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { Scenario { label: "2-layer-rgba-static".to_string(), layers: { - let bg = Arc::new(PooledVideoData::from_vec(generate_rgba_frame( - canvas_w, canvas_h, - ))); + let bg = + Arc::new(PooledVideoData::from_vec(generate_rgba_frame(canvas_w, canvas_h))); let pip = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(pip_w, pip_h))); vec![ Some(LayerSnapshot { @@ -531,12 +494,7 @@ fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec { width: pip_w, height: pip_h, pixel_format: PixelFormat::Rgba8, - rect: Some(Rect { - x: pip_x, - y: pip_y, - width: pip_w, - height: pip_h, - }), + rect: Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), opacity: 0.9, z_index: 1, rotation_degrees: 0.0, @@ -568,11 +526,7 @@ fn main() { eprintln!("╠══════════════════════════════════════════════════════════╣"); eprintln!( "║ Resolutions : {:<41}║", - resolutions - .iter() - .map(|(w, h)| format!("{w}×{h}")) - .collect::>() - .join(", ") + resolutions.iter().map(|(w, h)| format!("{w}×{h}")).collect::>().join(", ") ); eprintln!("║ Frames : {:<41}║", args.frame_count); eprintln!("║ Iterations : {:<41}║", args.iterations); @@ -621,11 +575,7 @@ fn main() { eprintln!( " {:<28} avg: {:>8.1} fps ({:.2} ms/frame, min={:.2}, max={:.2})", - "", - mean_fps, - mean_ms, - min_ms, - max_ms, + "", mean_fps, mean_ms, min_ms, max_ms, ); json_results.push(serde_json::json!({ From 74fedf8de0dc18604303c3bacca46cf624064be1 Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:33:17 +0000 Subject: [PATCH 03/12] =?UTF-8?q?perf(compositor):=20cache=20YUV=E2=86=92R?= =?UTF-8?q?GBA=20conversions=20+=20skip=20canvas=20clear?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Optimization 1: Add ConversionCache that tracks Arc pointer identity per layer slot. When the source Arc hasn't changed between frames, the cached RGBA data is reused (zero conversion cost). Replaces the old i420_scratch buffer approach. Optimization 2: Skip buf.fill(0) canvas clear when the first visible layer is opaque, unrotated, and fully covers the canvas dimensions. Saves one full-canvas memset per frame in the common case. Co-Authored-By: Claudio Costa --- crates/nodes/src/video/compositor/kernel.rs | 146 +++++++++++++++++--- crates/nodes/src/video/compositor/mod.rs | 26 ++-- 2 files changed, 137 insertions(+), 35 deletions(-) diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs index c173360e..76b0b0c7 100644 --- a/crates/nodes/src/video/compositor/kernel.rs +++ b/crates/nodes/src/video/compositor/kernel.rs @@ -19,6 +19,116 @@ use super::pixel_ops::{ // ── Compositing kernel (runs on a persistent blocking thread) ──────────────── +// ── YUV → RGBA conversion cache ───────────────────────────────────────────── + +/// Cached RGBA conversion result for a single layer slot. +struct CachedConversion { + /// Identity of the source data (`Arc::as_ptr` cast to `usize`). + /// When the `Arc` pointer hasn't changed between frames + /// the underlying data is identical and the conversion can be skipped. + data_identity: usize, + width: u32, + height: u32, + /// Pre-converted RGBA8 data, stored as a plain `Vec`. + rgba: Vec, +} + +/// Per-slot cache for YUV → RGBA conversions. +/// +/// Avoids redundant per-frame I420/NV12 → RGBA8 conversion when the source +/// `Arc` hasn't changed since the previous frame. +pub struct ConversionCache { + entries: Vec>, +} + +impl ConversionCache { + pub const fn new() -> Self { + Self { entries: Vec::new() } + } + + /// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`. + /// Returns a slice of RGBA8 data. 
+ fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { + let identity = Arc::as_ptr(&layer.data) as usize; + + // Ensure the cache Vec is large enough. + if self.entries.len() <= slot_idx { + self.entries.resize_with(slot_idx + 1, || None); + } + + // Check if the cached entry is still valid. + let needs_convert = match &self.entries[slot_idx] { + Some(cached) => { + cached.data_identity != identity + || cached.width != layer.width + || cached.height != layer.height + }, + None => true, + }; + + if needs_convert { + let needed = layer.width as usize * layer.height as usize * 4; + // Reuse the existing allocation if possible. + let mut rgba = self.entries[slot_idx].take().map(|c| c.rgba).unwrap_or_default(); + if rgba.len() < needed { + rgba.resize(needed, 0); + } + + match layer.pixel_format { + PixelFormat::I420 => { + i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba); + }, + PixelFormat::Nv12 => { + nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba); + }, + PixelFormat::Rgba8 => { + // Should not be called for RGBA, but handle gracefully. + rgba[..needed].copy_from_slice(&layer.data.as_slice()[..needed]); + }, + } + + self.entries[slot_idx] = Some(CachedConversion { + data_identity: identity, + width: layer.width, + height: layer.height, + rgba, + }); + } + + let cached = self.entries[slot_idx].as_ref().expect("just inserted"); + let needed = layer.width as usize * layer.height as usize * 4; + &cached.rgba[..needed] + } +} + +/// Returns `true` if the first visible layer is fully opaque, unrotated, and +/// covers the entire canvas — meaning the canvas clear can be skipped. 
+fn first_layer_covers_canvas( + layers: &[Option], + canvas_w: u32, + canvas_h: u32, +) -> bool { + let Some(first) = layers.iter().flatten().next() else { + return false; + }; + + if first.opacity < 1.0 || first.rotation_degrees.abs() >= 0.01 { + return false; + } + + // Check if the layer fully covers the canvas. + // A layer with no rect fills the entire canvas by default. + match &first.rect { + None => true, + Some(r) => { + r.x <= 0 + && r.y <= 0 + && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) + && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) + }, + } +} + /// Snapshot of one input layer's data for the blocking compositor thread. pub struct LayerSnapshot { pub data: Arc, @@ -56,8 +166,8 @@ pub struct CompositeResult { /// Composite all layers + overlays onto a fresh RGBA8 canvas buffer. /// Allocates from the video pool if available. /// -/// `i420_scratch` is a reusable buffer for I420/NV12→RGBA8 conversion, -/// avoiding per-frame allocation. +/// `conversion_cache` caches YUV→RGBA8 conversions across frames so that +/// unchanged layers skip the conversion entirely. pub fn composite_frame( canvas_w: u32, canvas_h: u32, @@ -65,7 +175,7 @@ pub fn composite_frame( image_overlays: &[Arc], text_overlays: &[Arc], video_pool: Option<&streamkit_core::VideoFramePool>, - i420_scratch: &mut Vec, + conversion_cache: &mut ConversionCache, ) -> streamkit_core::frame_pool::PooledVideoData { let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4; @@ -74,33 +184,25 @@ pub fn composite_frame( |pool| pool.get(total_bytes), ); - // Zero the buffer (transparent black). let buf = pooled.as_mut_slice(); - buf[..total_bytes].fill(0); + + // Skip the canvas clear when the first layer is opaque, unrotated, and + // covers the entire canvas — the blit will fully overwrite every pixel. 
+ if !first_layer_covers_canvas(layers, canvas_w, canvas_h) { + buf[..total_bytes].fill(0); + } // Blit each layer (in order — first layer is bottom, last is top). - // I420 layers are converted to RGBA8 on-the-fly using the scratch buffer. - for layer in layers.iter().flatten() { + // Non-RGBA layers use the conversion cache to avoid redundant per-frame + // YUV→RGBA8 conversion when the source data hasn't changed. + for (slot_idx, layer) in layers.iter().flatten().enumerate() { let dst_rect = layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h }); let src_data: &[u8] = match layer.pixel_format { PixelFormat::Rgba8 => layer.data.as_slice(), - PixelFormat::I420 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch); - &i420_scratch[..needed] - }, - PixelFormat::Nv12 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch); - &i420_scratch[..needed] + PixelFormat::I420 | PixelFormat::Nv12 => { + conversion_cache.get_or_convert(slot_idx, layer) }, }; diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs index 34dde60a..8b743172 100644 --- a/crates/nodes/src/video/compositor/mod.rs +++ b/crates/nodes/src/video/compositor/mod.rs @@ -48,7 +48,7 @@ use streamkit_core::{ }; use tokio::sync::mpsc; -use kernel::composite_frame; +use kernel::{composite_frame, ConversionCache}; // ── Input slot ────────────────────────────────────────────────────────────── @@ -285,9 +285,9 @@ impl ProcessorNode for CompositorNode { let (result_tx, mut result_rx) = tokio::sync::mpsc::channel::(2); let composite_thread = tokio::task::spawn_blocking(move || { - // Persistent scratch 
buffer for I420→RGBA8 layer conversion, - // reused across frames to avoid per-frame allocation. - let mut i420_to_rgba_scratch: Vec = Vec::new(); + // Per-slot cache for YUV→RGBA conversions. Avoids redundant + // conversion when the source Arc hasn't changed between frames. + let mut conversion_cache = ConversionCache::new(); while let Some(work) = work_rx.blocking_recv() { let rgba_buf = composite_frame( @@ -297,7 +297,7 @@ impl ProcessorNode for CompositorNode { &work.image_overlays, &work.text_overlays, work.video_pool.as_deref(), - &mut i420_to_rgba_scratch, + &mut conversion_cache, ); let result = CompositeResult { rgba_data: rgba_buf }; if result_tx.blocking_send(result).is_err() { @@ -851,8 +851,8 @@ mod tests { #[test] fn test_composite_frame_empty_layers() { // No layers, no overlays -> transparent black canvas. - let mut scratch = Vec::new(); - let result = composite_frame(4, 4, &[], &[], &[], None, &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(4, 4, &[], &[], &[], None, &mut cache); let buf = result.as_slice(); assert_eq!(buf.len(), 4 * 4 * 4); assert!(buf.iter().all(|&b| b == 0)); @@ -872,8 +872,8 @@ mod tests { rotation_degrees: 0.0, }; - let mut scratch = Vec::new(); - let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut cache); let buf = result.as_slice(); // Entire canvas should be red (scaled from 2x2 to 4x4). @@ -912,9 +912,9 @@ mod tests { rotation_degrees: 0.0, }; - let mut scratch = Vec::new(); + let mut cache = ConversionCache::new(); let result = - composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut scratch); + composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut cache); let buf = result.as_slice(); // (0,0) should be red. 
@@ -1079,8 +1079,8 @@ mod tests { let pool = FramePool::::preallocated(&[total], 2); assert_eq!(pool.stats().buckets[0].available, 2); - let mut scratch = Vec::new(); - let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut cache); assert_eq!(result.as_slice().len(), total); // One buffer was taken from the pool. assert_eq!(pool.stats().buckets[0].available, 1); From 4cdc376b9f33ed33669822e5bb71f4018d2dd007 Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:34:22 +0000 Subject: [PATCH 04/12] perf(compositor): precompute x-map to eliminate per-pixel division Optimization 3: Replace per-pixel `(dx + src_col_skip) * sw / rw` integer division in blit_row_opaque/blit_row_alpha with a single precomputed lookup table (x_map) built once per scale_blit_rgba call. Each destination column now does a table lookup instead of a division, removing O(width * height) divisions per layer per frame. Co-Authored-By: Claudio Costa --- .../nodes/src/video/compositor/pixel_ops.rs | 45 ++++++++----------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs index 3464eb8e..8d3de5b1 100644 --- a/crates/nodes/src/video/compositor/pixel_ops.rs +++ b/crates/nodes/src/video/compositor/pixel_ops.rs @@ -101,6 +101,11 @@ pub fn scale_blit_rgba( return; } + // Precompute the source-X lookup table once. This replaces the per-pixel + // `(dx + src_col_skip) * sw / rw` integer division with a single table + // lookup in the inner blit loops. + let x_map: Vec = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect(); + // Split the destination buffer into per-row slices so that each row can // be processed independently (and therefore in parallel). 
let row_stride = dw * 4; @@ -114,24 +119,13 @@ pub fn scale_blit_rgba( dst_rows.par_chunks_mut(row_stride).take(effective_rh).enumerate().for_each( |(dy, row_slice)| { let sy = (dy + src_row_skip) * sh / rh; - blit_row( - row_slice, - rx, - effective_rect_w, - src, - sw, - sh, - sy, - rw, - opacity, - src_col_skip, - ); + blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map); }, ); } else { for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() { let sy = (dy + src_row_skip) * sh / rh; - blit_row(row_slice, rx, effective_rect_w, src, sw, sh, sy, rw, opacity, src_col_skip); + blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map); } } } @@ -142,6 +136,9 @@ pub fn scale_blit_rgba( /// rows in parallel. The `row_slice` covers exactly one destination row /// starting at pixel column 0 (i.e. byte offset `rx * 4` is the first column /// we write to). +/// +/// `x_map` is a precomputed table mapping each destination column to the +/// corresponding source column, eliminating per-pixel integer division. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, @@ -155,18 +152,16 @@ fn blit_row( effective_rw: usize, src: &[u8], sw: usize, - sh: usize, sy: usize, - rw: usize, opacity: f32, - src_col_skip: usize, + x_map: &[usize], ) { // Fast path: when opacity is 1.0, we can skip the f32 multiply on alpha // and branch more cheaply. if opacity >= 1.0 { - blit_row_opaque(row_slice, rx, effective_rw, src, sw, sh, sy, rw, src_col_skip); + blit_row_opaque(row_slice, rx, effective_rw, src, sw, sy, x_map); } else { - blit_row_alpha(row_slice, rx, effective_rw, src, sw, sh, sy, rw, opacity, src_col_skip); + blit_row_alpha(row_slice, rx, effective_rw, src, sw, sy, opacity, x_map); } } @@ -184,6 +179,7 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 { /// per-pixel f32 multiply on the source alpha channel. /// /// Uses integer-only alpha blending for semi-transparent source pixels. 
+/// `x_map` provides precomputed source-X indices (one per destination column). #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, @@ -198,14 +194,12 @@ fn blit_row_opaque( effective_rw: usize, src: &[u8], sw: usize, - _sh: usize, sy: usize, - rw: usize, - src_col_skip: usize, + x_map: &[usize], ) { let src_row_base = sy * sw * 4; for dx in 0..effective_rw { - let sx = (dx + src_col_skip) * sw / rw; + let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; if src_idx + 3 >= src.len() { continue; @@ -242,6 +236,7 @@ fn blit_row_opaque( /// Applies the opacity multiplier to every source pixel's alpha channel. /// /// Uses integer-only alpha blending. +/// `x_map` provides precomputed source-X indices (one per destination column). #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, @@ -256,18 +251,16 @@ fn blit_row_alpha( effective_rw: usize, src: &[u8], sw: usize, - _sh: usize, sy: usize, - rw: usize, opacity: f32, - src_col_skip: usize, + x_map: &[usize], ) { // Pre-compute opacity as a 0..255 integer multiplier. let opacity_u16 = (opacity * 255.0 + 0.5) as u16; let src_row_base = sy * sw * 4; for dx in 0..effective_rw { - let sx = (dx + src_col_skip) * sw / rw; + let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; if src_idx + 3 >= src.len() { continue; From e28123fc1ccc1df4cc9cadd2d18df80b3a48d38a Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:35:16 +0000 Subject: [PATCH 05/12] perf(compositor): add identity-scale fast path for 1:1 opaque blits Optimization 4: When source dimensions match the destination rect, opacity is 1.0, and there's no clipping offset, bypass the x-map lookup entirely. For fully-opaque source rows, use bulk memcpy (copy_from_slice). For rows with semi-transparent pixels, use a simplified per-pixel blend without the scaling indirection. 
Co-Authored-By: Claudio Costa --- .../nodes/src/video/compositor/pixel_ops.rs | 59 +++++++++++++++++-- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs index 8d3de5b1..f0b61e18 100644 --- a/crates/nodes/src/video/compositor/pixel_ops.rs +++ b/crates/nodes/src/video/compositor/pixel_ops.rs @@ -101,11 +101,6 @@ pub fn scale_blit_rgba( return; } - // Precompute the source-X lookup table once. This replaces the per-pixel - // `(dx + src_col_skip) * sw / rw` integer division with a single table - // lookup in the inner blit loops. - let x_map: Vec = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect(); - // Split the destination buffer into per-row slices so that each row can // be processed independently (and therefore in parallel). let row_stride = dw * 4; @@ -115,6 +110,60 @@ pub fn scale_blit_rgba( let first_row_byte = ry * row_stride; let dst_rows = &mut dst[first_row_byte..]; + // ── Identity-scale fast path ─────────────────────────────────────── + // When source dimensions exactly match the destination rect and opacity + // is fully opaque, we can avoid per-pixel scaling entirely and use + // direct row copies (memcpy) for fully-opaque source rows. + if rw == sw && rh == sh && opacity >= 1.0 && src_col_skip == 0 && src_row_skip == 0 { + let src_row_bytes = sw * 4; + let copy_bytes = effective_rect_w * 4; + for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() { + let src_start = dy * src_row_bytes; + let src_end = src_start + copy_bytes; + if src_end > src.len() { + break; + } + let dst_start = rx * 4; + let dst_end = dst_start + copy_bytes; + if dst_end > row_slice.len() { + break; + } + // Check if the source row has any semi-transparent pixels. + // For fully-opaque rows, use bulk memcpy. For rows with alpha, + // fall back to per-pixel blending. 
+ let src_row = &src[src_start..src_end]; + let all_opaque = src_row.chunks_exact(4).all(|px| px[3] == 255); + if all_opaque { + row_slice[dst_start..dst_end].copy_from_slice(src_row); + } else { + // Per-pixel alpha blend (identity scale, so sx == dx). + for dx in 0..effective_rect_w { + let si = dx * 4; + let sa = src_row[si + 3]; + if sa == 255 { + row_slice[dst_start + dx * 4..dst_start + dx * 4 + 4] + .copy_from_slice(&src_row[si..si + 4]); + } else if sa > 0 { + let di = dst_start + dx * 4; + let a16 = u16::from(sa); + row_slice[di] = blend_u8(src_row[si], row_slice[di], a16); + row_slice[di + 1] = blend_u8(src_row[si + 1], row_slice[di + 1], a16); + row_slice[di + 2] = blend_u8(src_row[si + 2], row_slice[di + 2], a16); + let da = u16::from(row_slice[di + 3]); + row_slice[di + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; + } + } + } + } + return; + } + + // ── Scaled blit path ─────────────────────────────────────────────── + // Precompute the source-X lookup table once. This replaces the per-pixel + // `(dx + src_col_skip) * sw / rw` integer division with a single table + // lookup in the inner blit loops. + let x_map: Vec = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect(); + if effective_rh >= RAYON_ROW_THRESHOLD { dst_rows.par_chunks_mut(row_stride).take(effective_rh).enumerate().for_each( |(dy, row_slice)| { From e54470c21068c5a6331ea8900392720b32dcddee Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:36:00 +0000 Subject: [PATCH 06/12] perf(compositor): pre-scale image overlays at decode time Optimization 5: When a decoded image overlay's native dimensions differ from its target rect, pre-scale it once using nearest-neighbor at config/update time. This ensures the per-frame blit_overlay call hits the identity-scale fast path (memcpy) instead of re-scaling every frame. 
Co-Authored-By: Claudio Costa --- crates/nodes/src/video/compositor/overlay.rs | 53 +++++++++++++++++--- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/crates/nodes/src/video/compositor/overlay.rs b/crates/nodes/src/video/compositor/overlay.rs index 3072870a..0fc1e6b2 100644 --- a/crates/nodes/src/video/compositor/overlay.rs +++ b/crates/nodes/src/video/compositor/overlay.rs @@ -36,13 +36,52 @@ pub fn decode_image_overlay(config: &ImageOverlayConfig) -> Result 0 && target_h > 0 && (w != target_w || h != target_h) { + let raw = rgba.into_raw(); + let scaled = prescale_rgba(&raw, w, h, target_w, target_h); + Ok(DecodedOverlay { + rgba_data: scaled, + width: target_w, + height: target_h, + rect: config.rect.clone(), + opacity: config.opacity, + }) + } else { + Ok(DecodedOverlay { + rgba_data: rgba.into_raw(), + width: w, + height: h, + rect: config.rect.clone(), + opacity: config.opacity, + }) + } +} + +/// Nearest-neighbor scale an RGBA8 buffer from `(sw, sh)` to `(dw, dh)`. +/// Used once at config time so the per-frame blit is a 1:1 copy. 
+fn prescale_rgba(src: &[u8], sw: u32, sh: u32, dw: u32, dh: u32) -> Vec { + let sw = sw as usize; + let sh = sh as usize; + let dw = dw as usize; + let dh = dh as usize; + let mut out = vec![0u8; dw * dh * 4]; + for dy in 0..dh { + let sy = dy * sh / dh; + for dx in 0..dw { + let sx = dx * sw / dw; + let si = (sy * sw + sx) * 4; + let di = (dy * dw + dx) * 4; + out[di..di + 4].copy_from_slice(&src[si..si + 4]); + } + } + out } // ── Bundled default font ──────────────────────────────────────────────────── From 763a6bba0bc1739322b54b1cc3696f0af286fb3e Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:38:13 +0000 Subject: [PATCH 07/12] perf(compositor): cache layer configs and skip per-frame sort Optimization 6: Extract per-slot layer config resolution and z-order sorting into a rebuild_layer_cache() function that runs only when config or pin set changes (UpdateParams, pin add/remove, channel close). Per-frame layer building now uses the cached resolved configs and pre-sorted draw order instead of doing HashMap lookups and sort_by on every frame. Co-Authored-By: Claudio Costa --- crates/nodes/src/video/compositor/mod.rs | 142 +++++++++++++++-------- 1 file changed, 92 insertions(+), 50 deletions(-) diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs index 8b743172..e27bb8ce 100644 --- a/crates/nodes/src/video/compositor/mod.rs +++ b/crates/nodes/src/video/compositor/mod.rs @@ -30,7 +30,7 @@ mod overlay; pub mod pixel_ops; use async_trait::async_trait; -use config::{CompositorConfig, Rect}; +use config::CompositorConfig; use kernel::{CompositeResult, CompositeWorkItem, LayerSnapshot}; use overlay::{decode_image_overlay, rasterize_text_overlay, DecodedOverlay}; use schemars::schema_for; @@ -59,6 +59,63 @@ struct InputSlot { latest_frame: Option, } +// ── Cached layer config ───────────────────────────────────────────────────── + +/// Pre-resolved layer configuration for a single slot. 
+/// Rebuilt only when compositor config or pin set changes, avoiding +/// per-frame `HashMap` lookups and `sort_by` calls. +#[derive(Clone)] +struct ResolvedSlotConfig { + rect: Option, + opacity: f32, + z_index: i32, + rotation_degrees: f32, +} + +/// Rebuild the per-slot resolved configs and the z-sorted draw order. +/// +/// Called once at startup and whenever `UpdateParams` or pin management +/// changes the layer set. The returned draw order is a list of slot +/// indices sorted by `(z_index, slot_index)`. +fn rebuild_layer_cache( + slots: &[InputSlot], + config: &CompositorConfig, +) -> (Vec, Vec) { + let num_slots = slots.len(); + let mut configs: Vec = Vec::with_capacity(num_slots); + for (idx, slot) in slots.iter().enumerate() { + let layer_cfg = config.layers.get(&slot.name); + #[allow(clippy::option_if_let_else)] + let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg { + (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees) + } else if idx > 0 && num_slots > 1 { + // Auto-PiP: non-first layers without explicit config. + let pip_w = config.width / 3; + let pip_h = config.height / 3; + #[allow(clippy::cast_possible_wrap)] + let pip_x = (config.width - pip_w - 20) as i32; + #[allow(clippy::cast_possible_wrap)] + let pip_y = (config.height - pip_h - 20) as i32; + #[allow(clippy::cast_possible_wrap)] + ( + Some(config::Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + idx as i32, + 0.0, + ) + } else { + (None, 1.0, 0, 0.0) + }; + configs.push(ResolvedSlotConfig { rect, opacity, z_index, rotation_degrees }); + } + + // Pre-sort by (z_index, slot_index). 
+ let mut draw_order: Vec = (0..num_slots).collect(); + draw_order.sort_by(|&a, &b| configs[a].z_index.cmp(&configs[b].z_index).then(a.cmp(&b))); + + (configs, draw_order) +} + // ── Node ──────────────────────────────────────────────────────────────────── /// Composites multiple raw video inputs onto a single RGBA8 canvas with @@ -309,6 +366,14 @@ impl ProcessorNode for CompositorNode { let mut output_seq: u64 = 0; let mut stop_reason: &str = "shutdown"; + // ── Cached layer config + draw order ──────────────────────────── + // Rebuilt only when config or pin set changes (UpdateParams, + // pin add/remove, channel close). Avoids per-frame HashMap + // lookups and sort_by calls. + let mut layer_configs_dirty = true; + let mut resolved_configs: Vec = Vec::new(); + let mut sorted_draw_order: Vec = Vec::new(); + loop { // ── Take at most one frame from every slot (non-blocking) ─── // We intentionally take only one frame per slot per iteration so @@ -347,6 +412,7 @@ impl ProcessorNode for CompositorNode { params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -364,6 +430,7 @@ impl ProcessorNode for CompositorNode { msg, &mut slots, ); + layer_configs_dirty = true; } // Wait for a frame from any connected input. 
@@ -379,6 +446,7 @@ impl ProcessorNode for CompositorNode { slots[slot_idx].name ); slots.remove(slot_idx); + layer_configs_dirty = true; if slots.is_empty() { stop_reason = "all_inputs_closed"; should_break = true; @@ -425,6 +493,7 @@ impl ProcessorNode for CompositorNode { params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -440,6 +509,7 @@ impl ProcessorNode for CompositorNode { msg, &mut slots, ); + layer_configs_dirty = true; } } continue; @@ -463,6 +533,7 @@ impl ProcessorNode for CompositorNode { params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -473,69 +544,40 @@ impl ProcessorNode for CompositorNode { if let Some(ref mut pmrx) = pin_mgmt_rx { while let Ok(msg) = pmrx.try_recv() { Self::handle_pin_management(&mut self, msg, &mut slots); + layer_configs_dirty = true; } } + // ── Rebuild layer config cache if needed ───────────────────── + if layer_configs_dirty { + let (cfgs, order) = rebuild_layer_cache(&slots, &self.config); + resolved_configs = cfgs; + sorted_draw_order = order; + layer_configs_dirty = false; + } + // ── Send work to persistent compositing thread ───────────── - // Collect the data we need to send to the blocking thread. - let num_slots = slots.len(); - let mut layers: Vec> = slots + // Build layer snapshots in pre-sorted draw order using the + // cached per-slot configs (no HashMap lookup, no sort). + let layers: Vec> = sorted_draw_order .iter() - .enumerate() - .map(|(idx, slot)| { - slot.latest_frame.as_ref().map(|f| { - let layer_cfg = self.config.layers.get(&slot.name); - #[allow(clippy::option_if_let_else)] - let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg - { - // Explicit per-layer config. 
- (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees) - } else if idx > 0 && num_slots > 1 { - // Auto-PiP: non-first layers without explicit config - // are placed in the bottom-right corner at 1/3 canvas - // size with slight transparency. - let pip_w = self.config.width / 3; - let pip_h = self.config.height / 3; - #[allow(clippy::cast_possible_wrap)] - let pip_x = (self.config.width - pip_w - 20) as i32; - #[allow(clippy::cast_possible_wrap)] - let pip_y = (self.config.height - pip_h - 20) as i32; - #[allow(clippy::cast_possible_wrap)] - ( - Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), - 0.9, - idx as i32, - 0.0, - ) - } else { - // First layer (or single input): fill the canvas. - (None, 1.0, 0, 0.0) - }; + .map(|&idx| { + slots[idx].latest_frame.as_ref().map(|f| { + let cfg = &resolved_configs[idx]; LayerSnapshot { data: f.data.clone(), width: f.width, height: f.height, pixel_format: f.pixel_format, - rect, - opacity, - z_index, - rotation_degrees, + rect: cfg.rect.clone(), + opacity: cfg.opacity, + z_index: cfg.z_index, + rotation_degrees: cfg.rotation_degrees, } }) }) .collect(); - // Sort layers by z_index so that lower values are drawn first - // (bottom of the stack). `None` entries (slots without a frame) - // are pushed to the end — they are skipped during compositing - // anyway. 
- layers.sort_by(|a, b| match (a, b) { - (Some(la), Some(lb)) => la.z_index.cmp(&lb.z_index), - (Some(_), None) => std::cmp::Ordering::Less, - (None, Some(_)) => std::cmp::Ordering::Greater, - (None, None) => std::cmp::Ordering::Equal, - }); - stats_tracker.received(); let work_item = CompositeWorkItem { @@ -773,7 +815,7 @@ mod tests { use crate::test_utils::{ assert_state_initializing, assert_state_running, assert_state_stopped, create_test_context, }; - use config::LayerConfig; + use config::{LayerConfig, Rect}; use pixel_ops::scale_blit_rgba; use std::collections::HashMap; use tokio::sync::mpsc; From 91e93bb727f834426f9c9f647847fb518cbe6e27 Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:38:47 +0000 Subject: [PATCH 08/12] perf(frame_pool): preallocate video pool buckets at startup Optimization 7: Change video_default() from with_buckets (lazy, no preallocation) to preallocated_with_max with 2 buffers per bucket. This avoids cold-start allocation misses for the first few frames, matching the existing audio_default() pattern. Co-Authored-By: Claudio Costa --- crates/core/src/frame_pool.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs index c7a6a829..1e90b1e3 100644 --- a/crates/core/src/frame_pool.rs +++ b/crates/core/src/frame_pool.rs @@ -325,10 +325,15 @@ pub const DEFAULT_VIDEO_BUCKET_SIZES: &[usize] = &[ ]; pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16; +/// Number of buffers to preallocate per video bucket at startup. +/// Avoids cold-start misses for the first few frames. 
+pub const DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET: usize = 2; + impl FramePool { pub fn video_default() -> Self { - Self::with_buckets( - DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), + Self::preallocated_with_max( + DEFAULT_VIDEO_BUCKET_SIZES, + DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET, DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET, ) } From e942408ceffe74cfc4c21bd26b1f703e7914dc8b Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 08:45:11 +0000 Subject: [PATCH 09/12] style(compositor): fix clippy warnings from optimization changes - Use map_or instead of match/if-let-else in ConversionCache and first_layer_covers_canvas - Allow expect_used with safety comment in get_or_convert - Allow dead_code on LayerSnapshot::z_index (sorting moved upstream) - Allow needless_range_loop in blit_row_opaque/blit_row_alpha (dx used for both x_map index and dst offset) - Allow cast_possible_truncation on idx as i32 in rebuild_layer_cache Co-Authored-By: Claudio Costa --- crates/nodes/src/video/compositor/kernel.rs | 37 +++++++++---------- crates/nodes/src/video/compositor/mod.rs | 2 +- .../nodes/src/video/compositor/pixel_ops.rs | 6 ++- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs index 76b0b0c7..2f52485a 100644 --- a/crates/nodes/src/video/compositor/kernel.rs +++ b/crates/nodes/src/video/compositor/kernel.rs @@ -57,14 +57,11 @@ impl ConversionCache { } // Check if the cached entry is still valid. 
- let needs_convert = match &self.entries[slot_idx] { - Some(cached) => { - cached.data_identity != identity - || cached.width != layer.width - || cached.height != layer.height - }, - None => true, - }; + let needs_convert = self.entries[slot_idx].as_ref().map_or(true, |cached| { + cached.data_identity != identity + || cached.width != layer.width + || cached.height != layer.height + }); if needs_convert { let needed = layer.width as usize * layer.height as usize * 4; @@ -95,6 +92,9 @@ impl ConversionCache { }); } + // SAFETY: we just inserted into this slot above when `needs_convert` was true, + // and the slot was already `Some` when `needs_convert` was false. + #[allow(clippy::expect_used)] let cached = self.entries[slot_idx].as_ref().expect("just inserted"); let needed = layer.width as usize * layer.height as usize * 4; &cached.rgba[..needed] @@ -118,15 +118,12 @@ fn first_layer_covers_canvas( // Check if the layer fully covers the canvas. // A layer with no rect fills the entire canvas by default. - match &first.rect { - None => true, - Some(r) => { - r.x <= 0 - && r.y <= 0 - && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) - && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) - }, - } + first.rect.as_ref().map_or(true, |r| { + r.x <= 0 + && r.y <= 0 + && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) + && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) + }) } /// Snapshot of one input layer's data for the blocking compositor thread. @@ -137,8 +134,10 @@ pub struct LayerSnapshot { pub pixel_format: PixelFormat, pub rect: Option, pub opacity: f32, - /// Visual stacking order. Lower values are drawn first (bottom). - /// Used to sort layers before compositing; ties broken by slot index. + /// Visual stacking order. Retained in the snapshot for diagnostic / + /// logging purposes even though sorting now happens before snapshot + /// construction. 
+ #[allow(dead_code)] pub z_index: i32, /// Clockwise rotation in degrees around the destination rect centre. /// Default `0.0` means no rotation. diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs index e27bb8ce..51b20429 100644 --- a/crates/nodes/src/video/compositor/mod.rs +++ b/crates/nodes/src/video/compositor/mod.rs @@ -96,7 +96,7 @@ fn rebuild_layer_cache( let pip_x = (config.width - pip_w - 20) as i32; #[allow(clippy::cast_possible_wrap)] let pip_y = (config.height - pip_h - 20) as i32; - #[allow(clippy::cast_possible_wrap)] + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] ( Some(config::Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), 0.9, diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs index f0b61e18..a87f13fd 100644 --- a/crates/nodes/src/video/compositor/pixel_ops.rs +++ b/crates/nodes/src/video/compositor/pixel_ops.rs @@ -234,7 +234,8 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 { clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, - clippy::inline_always + clippy::inline_always, + clippy::needless_range_loop )] #[inline(always)] fn blit_row_opaque( @@ -291,7 +292,8 @@ fn blit_row_opaque( clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, - clippy::inline_always + clippy::inline_always, + clippy::needless_range_loop )] #[inline(always)] fn blit_row_alpha( From 8311d56208d561816e08fd334cc48dd8a8a18c0f Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 09:15:55 +0000 Subject: [PATCH 10/12] fix(compositor): address correctness + bench issues from review - Fix #1 (High): skip-clear now validates source pixel alpha (all pixels must have alpha==255) before skipping canvas clear. Prevents blending against stale pooled buffer data when RGBA source has transparency. 
- Fix #2 (Medium): conversion cache slot indices now use position in the full layers slice (with None holes) via two-pass resolution, so cache keys stay stable when slots gain/lose frames. - Fix #3 (Medium): benchmark now calls real composite_frame() kernel instead of reimplementing compositing inline. Exercises all kernel optimizations (cache, clear-skip, identity fast-path, x-map). - Fix Devin Review: revert video pool preallocation (was allocating ~121MB across all bucket sizes at startup). Restored lazy allocation. Co-Authored-By: Claudio Costa --- crates/core/src/frame_pool.rs | 10 +-- crates/engine/benches/compositor_only.rs | 91 ++++--------------- crates/nodes/src/video/compositor/kernel.rs | 97 +++++++++++++++++---- crates/nodes/src/video/compositor/mod.rs | 4 +- 4 files changed, 100 insertions(+), 102 deletions(-) diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs index 1e90b1e3..9e3bf98e 100644 --- a/crates/core/src/frame_pool.rs +++ b/crates/core/src/frame_pool.rs @@ -325,17 +325,9 @@ pub const DEFAULT_VIDEO_BUCKET_SIZES: &[usize] = &[ ]; pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16; -/// Number of buffers to preallocate per video bucket at startup. -/// Avoids cold-start misses for the first few frames. -pub const DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET: usize = 2; - impl FramePool { pub fn video_default() -> Self { - Self::preallocated_with_max( - DEFAULT_VIDEO_BUCKET_SIZES, - DEFAULT_VIDEO_PREALLOCATE_PER_BUCKET, - DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET, - ) + Self::with_buckets(DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET) } } diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs index 31ea5b85..f5036d7d 100644 --- a/crates/engine/benches/compositor_only.rs +++ b/crates/engine/benches/compositor_only.rs @@ -41,21 +41,10 @@ use streamkit_core::types::PixelFormat; // Re-use the compositor kernel and pixel_ops directly. 
use streamkit_nodes::video::compositor::config::Rect; +use streamkit_nodes::video::compositor::kernel::{composite_frame, ConversionCache, LayerSnapshot}; +use streamkit_nodes::video::compositor::overlay::DecodedOverlay; use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420; -/// Inline copy of `LayerSnapshot` to avoid depending on the private `kernel` module. -/// Must stay in sync with `kernel::LayerSnapshot`. -struct LayerSnapshot { - data: Arc, - width: u32, - height: u32, - pixel_format: PixelFormat, - rect: Option, - opacity: f32, - z_index: i32, - rotation_degrees: f32, -} - // ── Default benchmark parameters ──────────────────────────────────────────── const DEFAULT_WIDTH: u32 = 1280; @@ -178,8 +167,10 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec { // ── Compositing harness ───────────────────────────────────────────────────── -/// Directly call the compositing kernel for `frame_count` iterations, -/// returning per-frame timing statistics. +/// Call the real `composite_frame` kernel for `frame_count` iterations, +/// returning per-frame timing statistics. This exercises all kernel +/// optimizations: conversion cache, skip-canvas-clear, identity-scale +/// fast-path, precomputed x-map, etc. fn bench_composite( _label: &str, canvas_w: u32, @@ -187,69 +178,21 @@ fn bench_composite( layers: &[Option], frame_count: u32, ) -> BenchResult { - // Re-create the kernel's compositing logic inline since `composite_frame` - // is pub(crate). We call the public pixel_ops functions directly. - let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4; - let mut canvas = vec![0u8; total_bytes]; - let mut i420_scratch: Vec = Vec::new(); + let empty_overlays: Vec> = Vec::new(); + let mut conversion_cache = ConversionCache::new(); let start = Instant::now(); for _ in 0..frame_count { - // Zero the canvas. - canvas.fill(0); - - // Blit each layer. 
- for layer in layers.iter().flatten() { - let dst_rect = layer.rect.clone().unwrap_or(Rect { - x: 0, - y: 0, - width: canvas_w, - height: canvas_h, - }); - - let src_data: &[u8] = match layer.pixel_format { - PixelFormat::Rgba8 => layer.data.as_slice(), - PixelFormat::I420 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - streamkit_nodes::video::compositor::pixel_ops::i420_to_rgba8_buf( - layer.data.as_slice(), - layer.width, - layer.height, - &mut i420_scratch, - ); - &i420_scratch[..needed] - }, - PixelFormat::Nv12 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - streamkit_nodes::video::compositor::pixel_ops::nv12_to_rgba8_buf( - layer.data.as_slice(), - layer.width, - layer.height, - &mut i420_scratch, - ); - &i420_scratch[..needed] - }, - }; - - streamkit_nodes::video::compositor::pixel_ops::scale_blit_rgba_rotated( - &mut canvas, - canvas_w, - canvas_h, - src_data, - layer.width, - layer.height, - &dst_rect, - layer.opacity, - layer.rotation_degrees, - ); - } + let _result = composite_frame( + canvas_w, + canvas_h, + layers, + &empty_overlays, + &empty_overlays, + None, + &mut conversion_cache, + ); } let elapsed = start.elapsed(); diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs index 2f52485a..3e979fe6 100644 --- a/crates/nodes/src/video/compositor/kernel.rs +++ b/crates/nodes/src/video/compositor/kernel.rs @@ -46,6 +46,23 @@ impl ConversionCache { Self { entries: Vec::new() } } + /// Return a previously-cached RGBA slice for `slot_idx`. + /// + /// # Panics + /// + /// Panics if the slot has not been populated by a prior `get_or_convert` + /// call for the same `layer`. This is only called in the second pass of + /// `composite_frame` after the first pass has ensured every non-RGBA + /// layer has been converted. 
+ fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { + #[allow(clippy::expect_used)] + let cached = self.entries[slot_idx] + .as_ref() + .expect("get_cached called before get_or_convert"); + let needed = layer.width as usize * layer.height as usize * 4; + &cached.rgba[..needed] + } + /// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`. /// Returns a slice of RGBA8 data. fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { @@ -101,12 +118,20 @@ impl ConversionCache { } } -/// Returns `true` if the first visible layer is fully opaque, unrotated, and -/// covers the entire canvas — meaning the canvas clear can be skipped. +/// Returns `true` if the first visible layer is fully opaque (both layer +/// opacity *and* source pixel alpha), unrotated, and covers the entire +/// canvas — meaning the canvas clear can be skipped. +/// +/// `first_src_data` is the RGBA8 source buffer for the first layer (after +/// any YUV→RGBA conversion). We check that every pixel in the region that +/// will be blitted has `alpha == 255`; if any pixel is semi-transparent the +/// clear cannot be skipped because the blit would blend with uninitialised +/// (or stale pooled) canvas bytes. fn first_layer_covers_canvas( layers: &[Option], canvas_w: u32, canvas_h: u32, + first_src_data: Option<&[u8]>, ) -> bool { let Some(first) = layers.iter().flatten().next() else { return false; @@ -118,12 +143,23 @@ fn first_layer_covers_canvas( // Check if the layer fully covers the canvas. // A layer with no rect fills the entire canvas by default. - first.rect.as_ref().map_or(true, |r| { + let covers = first.rect.as_ref().map_or(true, |r| { r.x <= 0 && r.y <= 0 && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) - }) + }); + if !covers { + return false; + } + + // Verify that *all* source pixels are fully opaque (alpha == 255). 
+ // Without this check, semi-transparent source pixels would blend with + // uninitialised canvas bytes when the clear is skipped. + let Some(src) = first_src_data else { + return false; + }; + src.chunks_exact(4).all(|px| px[3] == 255) } /// Snapshot of one input layer's data for the blocking compositor thread. @@ -185,26 +221,53 @@ pub fn composite_frame( let buf = pooled.as_mut_slice(); - // Skip the canvas clear when the first layer is opaque, unrotated, and - // covers the entire canvas — the blit will fully overwrite every pixel. - if !first_layer_covers_canvas(layers, canvas_w, canvas_h) { + // Two-pass source resolution. + // + // Pass 1: populate the conversion cache for every non-RGBA layer. + // `slot_idx` uses the position in the `layers` slice (which preserves + // `None` holes) so that cache indices stay stable even when some slots + // have no frame. + for (slot_idx, entry) in layers.iter().enumerate() { + if let Some(layer) = entry { + if layer.pixel_format != PixelFormat::Rgba8 { + conversion_cache.get_or_convert(slot_idx, layer); + } + } + } + + // Pass 2: build resolved references. The mutable borrow of + // `conversion_cache` from pass 1 is released, so we can now take + // shared references into the cache alongside references into `layers`. + let resolved: Vec> = layers + .iter() + .enumerate() + .map(|(slot_idx, entry)| { + entry.as_ref().map(|layer| { + let src_data: &[u8] = match layer.pixel_format { + PixelFormat::Rgba8 => layer.data.as_slice(), + PixelFormat::I420 | PixelFormat::Nv12 => { + // Cache was populated in pass 1; this is a shared + // read that cannot fail. + conversion_cache.get_cached(slot_idx, layer) + }, + }; + (layer, src_data) + }) + }) + .collect(); + + // Now that we have the first layer's resolved RGBA data, check whether + // the canvas clear can be skipped. 
+ let first_src = resolved.iter().flatten().next().map(|(_, d)| *d); + if !first_layer_covers_canvas(layers, canvas_w, canvas_h, first_src) { buf[..total_bytes].fill(0); } // Blit each layer (in order — first layer is bottom, last is top). - // Non-RGBA layers use the conversion cache to avoid redundant per-frame - // YUV→RGBA8 conversion when the source data hasn't changed. - for (slot_idx, layer) in layers.iter().flatten().enumerate() { + for (layer, src_data) in resolved.iter().flatten() { let dst_rect = layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h }); - let src_data: &[u8] = match layer.pixel_format { - PixelFormat::Rgba8 => layer.data.as_slice(), - PixelFormat::I420 | PixelFormat::Nv12 => { - conversion_cache.get_or_convert(slot_idx, layer) - }, - }; - scale_blit_rgba_rotated( buf, canvas_w, diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs index 51b20429..9c45af98 100644 --- a/crates/nodes/src/video/compositor/mod.rs +++ b/crates/nodes/src/video/compositor/mod.rs @@ -25,8 +25,8 @@ //! - Bilinear / Lanczos scaling (MVP uses nearest-neighbor). 
pub mod config; -mod kernel; -mod overlay; +pub mod kernel; +pub mod overlay; pub mod pixel_ops; use async_trait::async_trait; From 3b26f3269dd5d1684467ca34c4b4ee444970b5c1 Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 09:16:04 +0000 Subject: [PATCH 11/12] style: apply rustfmt to fix formatting Co-Authored-By: Claudio Costa --- crates/core/src/frame_pool.rs | 5 ++++- crates/nodes/src/video/compositor/kernel.rs | 5 ++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs index 9e3bf98e..c7a6a829 100644 --- a/crates/core/src/frame_pool.rs +++ b/crates/core/src/frame_pool.rs @@ -327,7 +327,10 @@ pub const DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET: usize = 16; impl FramePool { pub fn video_default() -> Self { - Self::with_buckets(DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET) + Self::with_buckets( + DEFAULT_VIDEO_BUCKET_SIZES.to_vec(), + DEFAULT_VIDEO_MAX_BUFFERS_PER_BUCKET, + ) } } diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs index 3e979fe6..424decdb 100644 --- a/crates/nodes/src/video/compositor/kernel.rs +++ b/crates/nodes/src/video/compositor/kernel.rs @@ -56,9 +56,8 @@ impl ConversionCache { /// layer has been converted. 
fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { #[allow(clippy::expect_used)] - let cached = self.entries[slot_idx] - .as_ref() - .expect("get_cached called before get_or_convert"); + let cached = + self.entries[slot_idx].as_ref().expect("get_cached called before get_or_convert"); let needed = layer.width as usize * layer.height as usize * 4; &cached.rgba[..needed] } From f71b7b563015b6b2aeab4be89a139335bc612d4a Mon Sep 17 00:00:00 2001 From: StreamKit Devin Date: Sun, 1 Mar 2026 09:37:10 +0000 Subject: [PATCH 12/12] perf(compositor): SSE2 blend, alpha-scan cache, bench pool, lazy prealloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix 4 remaining performance findings: 1. High: Add SSE2 SIMD fast path for RGBA blend loops (blit_row_opaque, blit_row_alpha). Processes 4 pixels at a time with fast-paths for fully-opaque (direct copy) and fully-transparent (skip) source pixels. 2. Medium: Optimize alpha scan in clear-skip check — skip scan entirely for I420/NV12 layers (always alpha=255 after conversion), cache scan result by Arc pointer identity for RGBA layers. 3. Medium: Pass VideoFramePool to bench_composite instead of None, so benchmark exercises pool reuse like production. 4. Low-Medium: Lazy preallocate on first bucket use — when a bucket is first hit, allocate one extra buffer so the second get() is a hit. Also: inline clear-skip logic to fix borrow checker conflict, remove unused first_layer_covers_canvas function, add clippy suppression rationale comments for needless_range_loop. 
Co-Authored-By: Claudio Costa --- crates/core/src/frame_pool.rs | 11 + crates/engine/benches/compositor_only.rs | 9 +- crates/nodes/src/video/compositor/kernel.rs | 117 ++++---- .../nodes/src/video/compositor/pixel_ops.rs | 279 +++++++++++++++++- 4 files changed, 360 insertions(+), 56 deletions(-) diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs index c7a6a829..9489df8e 100644 --- a/crates/core/src/frame_pool.rs +++ b/crates/core/src/frame_pool.rs @@ -139,6 +139,11 @@ impl FramePool { /// Get pooled storage for at least `min_len` elements. /// /// If `min_len` doesn't fit in any bucket, returns a non-pooled buffer of exact size. + /// + /// On the first miss for a given bucket (cold start), an extra buffer is + /// allocated and placed into the pool so that the *next* `get()` at the + /// same size is a hit. This amortises cold-start allocation cost without + /// pre-allocating every bucket size up front. pub fn get(&self, min_len: usize) -> PooledFrameData { let (handle, bucket_idx, bucket_size, maybe_buf) = { let Ok(mut guard) = self.inner.lock() else { @@ -154,6 +159,12 @@ impl FramePool { guard.hits += 1; } else { guard.misses += 1; + // Lazy preallocate: on first miss for this bucket, seed the + // pool with one extra buffer so subsequent gets are hits. + if guard.buckets[bucket_idx].is_empty() && guard.buckets[bucket_idx].capacity() == 0 + { + guard.buckets[bucket_idx].push(vec![T::default(); bucket_size]); + } } (self.handle(), bucket_idx, bucket_size, buf) }; diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs index f5036d7d..5a6263fe 100644 --- a/crates/engine/benches/compositor_only.rs +++ b/crates/engine/benches/compositor_only.rs @@ -38,6 +38,7 @@ use std::time::Instant; use streamkit_core::frame_pool::PooledVideoData; use streamkit_core::types::PixelFormat; +use streamkit_core::VideoFramePool; // Re-use the compositor kernel and pixel_ops directly. 
use streamkit_nodes::video::compositor::config::Rect; @@ -170,7 +171,10 @@ fn generate_nv12_frame(width: u32, height: u32) -> Vec { /// Call the real `composite_frame` kernel for `frame_count` iterations, /// returning per-frame timing statistics. This exercises all kernel /// optimizations: conversion cache, skip-canvas-clear, identity-scale -/// fast-path, precomputed x-map, etc. +/// fast-path, precomputed x-map, SSE2 blend, etc. +/// +/// Uses a real `VideoFramePool` to match production behaviour (pooled buffer +/// reuse instead of per-frame heap allocation). fn bench_composite( _label: &str, canvas_w: u32, @@ -180,6 +184,7 @@ fn bench_composite( ) -> BenchResult { let empty_overlays: Vec> = Vec::new(); let mut conversion_cache = ConversionCache::new(); + let pool = VideoFramePool::video_default(); let start = Instant::now(); @@ -190,7 +195,7 @@ fn bench_composite( layers, &empty_overlays, &empty_overlays, - None, + Some(&pool), &mut conversion_cache, ); } diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs index 424decdb..621fe893 100644 --- a/crates/nodes/src/video/compositor/kernel.rs +++ b/crates/nodes/src/video/compositor/kernel.rs @@ -37,13 +37,42 @@ struct CachedConversion { /// /// Avoids redundant per-frame I420/NV12 → RGBA8 conversion when the source /// `Arc` hasn't changed since the previous frame. +/// +/// Also caches the first-layer alpha-scan result so that the canvas-clear +/// skip check doesn't re-scan every frame when the source hasn't changed. pub struct ConversionCache { entries: Vec>, + /// Cached result of the alpha-opaqueness scan for the first visible layer. + /// `(data_identity, all_opaque)` — valid when the `Arc` pointer matches. 
+ first_layer_alpha_cache: Option<(usize, bool)>, } impl ConversionCache { pub const fn new() -> Self { - Self { entries: Vec::new() } + Self { entries: Vec::new(), first_layer_alpha_cache: None } + } + + /// Check whether the first visible layer's source data is fully opaque. + /// + /// For I420/NV12 layers, the converted RGBA always has alpha == 255, so + /// we return `true` immediately without scanning. For RGBA layers we + /// scan once and cache the result keyed by `Arc::as_ptr`. + fn first_layer_all_opaque(&mut self, layer: &LayerSnapshot, rgba_data: &[u8]) -> bool { + // I420/NV12 → RGBA conversion always writes alpha = 255. + if layer.pixel_format != PixelFormat::Rgba8 { + return true; + } + + let identity = Arc::as_ptr(&layer.data) as usize; + if let Some((cached_id, cached_result)) = self.first_layer_alpha_cache { + if cached_id == identity { + return cached_result; + } + } + + let all_opaque = rgba_data.chunks_exact(4).all(|px| px[3] == 255); + self.first_layer_alpha_cache = Some((identity, all_opaque)); + all_opaque } /// Return a previously-cached RGBA slice for `slot_idx`. @@ -117,50 +146,6 @@ impl ConversionCache { } } -/// Returns `true` if the first visible layer is fully opaque (both layer -/// opacity *and* source pixel alpha), unrotated, and covers the entire -/// canvas — meaning the canvas clear can be skipped. -/// -/// `first_src_data` is the RGBA8 source buffer for the first layer (after -/// any YUV→RGBA conversion). We check that every pixel in the region that -/// will be blitted has `alpha == 255`; if any pixel is semi-transparent the -/// clear cannot be skipped because the blit would blend with uninitialised -/// (or stale pooled) canvas bytes. 
-fn first_layer_covers_canvas( - layers: &[Option], - canvas_w: u32, - canvas_h: u32, - first_src_data: Option<&[u8]>, -) -> bool { - let Some(first) = layers.iter().flatten().next() else { - return false; - }; - - if first.opacity < 1.0 || first.rotation_degrees.abs() >= 0.01 { - return false; - } - - // Check if the layer fully covers the canvas. - // A layer with no rect fills the entire canvas by default. - let covers = first.rect.as_ref().map_or(true, |r| { - r.x <= 0 - && r.y <= 0 - && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) - && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) - }); - if !covers { - return false; - } - - // Verify that *all* source pixels are fully opaque (alpha == 255). - // Without this check, semi-transparent source pixels would blend with - // uninitialised canvas bytes when the clear is skipped. - let Some(src) = first_src_data else { - return false; - }; - src.chunks_exact(4).all(|px| px[3] == 255) -} - /// Snapshot of one input layer's data for the blocking compositor thread. pub struct LayerSnapshot { pub data: Arc, @@ -234,6 +219,41 @@ pub fn composite_frame( } } + // Between pass 1 and pass 2: check whether the first layer allows + // skipping the canvas clear. We do the alpha-opaqueness check here + // while `conversion_cache` is still mutably available. The result + // is a simple bool so no borrows leak into pass 2. + let skip_clear = layers + .iter() + .enumerate() + .find_map(|(i, e)| e.as_ref().map(|l| (i, l))) + .map_or(false, |(_slot_idx, layer)| { + // Quick checks that don't need the pixel data. + if layer.opacity < 1.0 || layer.rotation_degrees.abs() >= 0.01 { + return false; + } + let covers = layer.rect.as_ref().map_or(true, |r| { + r.x <= 0 + && r.y <= 0 + && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) + && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) + }); + if !covers { + return false; + } + // Alpha check — needs mutable access to conversion_cache. 
+ match layer.pixel_format { + // I420/NV12 → RGBA conversion always writes alpha = 255. + PixelFormat::I420 | PixelFormat::Nv12 => true, + PixelFormat::Rgba8 => { + conversion_cache.first_layer_all_opaque(layer, layer.data.as_slice()) + }, + } + }); + if !skip_clear { + buf[..total_bytes].fill(0); + } + // Pass 2: build resolved references. The mutable borrow of // `conversion_cache` from pass 1 is released, so we can now take // shared references into the cache alongside references into `layers`. @@ -255,13 +275,6 @@ pub fn composite_frame( }) .collect(); - // Now that we have the first layer's resolved RGBA data, check whether - // the canvas clear can be skipped. - let first_src = resolved.iter().flatten().next().map(|(_, d)| *d); - if !first_layer_covers_canvas(layers, canvas_w, canvas_h, first_src) { - buf[..total_bytes].fill(0); - } - // Blit each layer (in order — first layer is bottom, last is top). for (layer, src_data) in resolved.iter().flatten() { let dst_rect = diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs index a87f13fd..3c6dd03e 100644 --- a/crates/nodes/src/video/compositor/pixel_ops.rs +++ b/crates/nodes/src/video/compositor/pixel_ops.rs @@ -224,17 +224,183 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 { ((val + (val >> 8)) >> 8) as u8 } +// ── SSE2 alpha-blend helpers (x86-64) ────────────────────────────────────── +// +// Process 4 RGBA pixels at a time using SSE2 integer arithmetic. +// Source pixels are gathered (non-contiguous via x_map), destination pixels +// are contiguous. 
The blend formula is identical to the scalar `blend_u8`: +// result = ((src*alpha + dst*(255-alpha) + 128) + ((…) >> 8)) >> 8 +// +// For the alpha channel we set source-alpha to 255 before blending so that +// `blend_u8(255, dst_alpha, src_alpha)` naturally computes the standard +// over-composite alpha `a_src + a_dst*(1-a_src)` (within ±1 of the scalar +// approximation — both are approximate divisions by 255). + +/// Read 4 bytes from `src` at `offset` as a native-endian `u32`. +/// +/// # Safety +/// +/// Caller must ensure `offset + 3 < src.len()`. +#[inline(always)] +unsafe fn read_rgba_u32(src: &[u8], offset: usize) -> u32 { + std::ptr::read_unaligned(src.as_ptr().add(offset) as *const u32) +} + +/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels +/// using SSE2 "over" compositing (no opacity modifier). +/// +/// # Safety +/// +/// `dst_ptr` must point to at least 16 writable bytes. Source pixel values +/// in `src_pixels` must be valid RGBA `u32` values. +#[cfg(target_arch = "x86_64")] +#[inline(always)] +unsafe fn blend_4px_over_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4]) { + use std::arch::x86_64::*; + + let zero = _mm_setzero_si128(); + let c255 = _mm_set1_epi16(255); + let c128 = _mm_set1_epi16(128); + + // Assemble 4 gathered source pixels into one register. + let src4 = _mm_set_epi32( + src_pixels[3] as i32, + src_pixels[2] as i32, + src_pixels[1] as i32, + src_pixels[0] as i32, + ); + + // Mask with 0xFF at each pixel's alpha-byte position (bytes 3,7,11,15). + let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32); + + // Fast path: all 4 source pixels fully opaque → direct copy. + let alpha_bytes = _mm_and_si128(src4, alpha_byte_mask); + if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, alpha_byte_mask)) == 0xFFFF { + _mm_storeu_si128(dst_ptr as *mut __m128i, src4); + return; + } + + // Fast path: all 4 source pixels fully transparent → nothing to do. 
+ if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, zero)) == 0xFFFF { + return; + } + + let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i); + + // Replace source alpha channel with 255 for correct composite-alpha + // via blend_u8(255, dst_alpha, src_alpha). + let src_blend = _mm_or_si128(src4, alpha_byte_mask); + + // --- Low 2 pixels (u16 arithmetic) --- + let src_lo = _mm_unpacklo_epi8(src_blend, zero); + let dst_lo = _mm_unpacklo_epi8(dst4, zero); + + // Extract original source alpha and broadcast within each 4-u16 pixel group. + let src_orig_lo = _mm_unpacklo_epi8(src4, zero); + // _MM_SHUFFLE(3,3,3,3) = 0xFF → replicate element 3 (alpha) to all 4 positions. + let alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF); + + let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo); + let val_lo = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)), + c128, + ); + let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8); + + // --- High 2 pixels --- + let src_hi = _mm_unpackhi_epi8(src_blend, zero); + let dst_hi = _mm_unpackhi_epi8(dst4, zero); + let src_orig_hi = _mm_unpackhi_epi8(src4, zero); + let alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF); + + let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi); + let val_hi = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)), + c128, + ); + let result_hi = _mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8); + + // Pack back to u8 and store. + _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi)); +} + +/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels +/// using SSE2 "over" compositing **with** an opacity multiplier applied to +/// each pixel's source alpha. +/// +/// # Safety +/// +/// `dst_ptr` must point to at least 16 writable bytes. 
+#[cfg(target_arch = "x86_64")] +#[inline(always)] +unsafe fn blend_4px_over_alpha_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4], opacity: u16) { + use std::arch::x86_64::*; + + let zero = _mm_setzero_si128(); + let c255 = _mm_set1_epi16(255); + let c128 = _mm_set1_epi16(128); + let opacity_v = _mm_set1_epi16(opacity as i16); + + let src4 = _mm_set_epi32( + src_pixels[3] as i32, + src_pixels[2] as i32, + src_pixels[1] as i32, + src_pixels[0] as i32, + ); + + let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i); + let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32); + let src_blend = _mm_or_si128(src4, alpha_byte_mask); + + // --- Low 2 pixels --- + let src_lo = _mm_unpacklo_epi8(src_blend, zero); + let dst_lo = _mm_unpacklo_epi8(dst4, zero); + + // Extract original alpha, apply opacity: sa_eff = (sa * opacity + 128) >> 8. + // Max value: (255*255+128)>>8 = 254, so no clamping needed. + let src_orig_lo = _mm_unpacklo_epi8(src4, zero); + let raw_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF); + let alpha_lo = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_lo, opacity_v), c128), 8); + + let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo); + let val_lo = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)), + c128, + ); + let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8); + + // --- High 2 pixels --- + let src_hi = _mm_unpackhi_epi8(src_blend, zero); + let dst_hi = _mm_unpackhi_epi8(dst4, zero); + let src_orig_hi = _mm_unpackhi_epi8(src4, zero); + let raw_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF); + let alpha_hi = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_hi, opacity_v), c128), 8); + + let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi); + let val_hi = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)), + c128, + ); + let result_hi = 
_mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8); + + _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi)); +} + /// Inner blit for fully-opaque layers (`opacity >= 1.0`). Skips the /// per-pixel f32 multiply on the source alpha channel. /// /// Uses integer-only alpha blending for semi-transparent source pixels. /// `x_map` provides precomputed source-X indices (one per destination column). +/// +/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is +/// wide enough and bounds can be pre-validated. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, clippy::inline_always, + // dx is used as both x_map index and dst offset, so an iterator is non-trivial. clippy::needless_range_loop )] #[inline(always)] @@ -248,6 +414,59 @@ fn blit_row_opaque( x_map: &[usize], ) { let src_row_base = sy * sw * 4; + + // ── SSE2 fast path: process 4 pixels at a time ───────────────────── + #[cfg(target_arch = "x86_64")] + { + // Pre-validate bounds so the inner SIMD loop is branch-free. + let src_row_end = src_row_base + sw * 4; + let dst_end = (rx + effective_rw) * 4; + if src_row_end <= src.len() && dst_end <= row_slice.len() { + let chunks = effective_rw / 4; + for c in 0..chunks { + let dx = c * 4; + // SAFETY: bounds pre-validated above; x_map values < sw; + // dst range (rx+dx)*4..(rx+dx+4)*4 < dst_end <= row_slice.len(). + unsafe { + let pixels = [ + read_rgba_u32(src, src_row_base + x_map[dx] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4), + ]; + blend_4px_over_sse2(row_slice.as_mut_ptr().add((rx + dx) * 4), pixels); + } + } + + // Scalar tail for remaining 0-3 pixels. 
+ let tail_start = chunks * 4; + for dx in tail_start..effective_rw { + let sx = x_map[dx]; + let src_idx = src_row_base + sx * 4; + let sr = src[src_idx]; + let sg = src[src_idx + 1]; + let sb = src[src_idx + 2]; + let sa = src[src_idx + 3]; + let dst_idx = (rx + dx) * 4; + if sa == 255 { + row_slice[dst_idx] = sr; + row_slice[dst_idx + 1] = sg; + row_slice[dst_idx + 2] = sb; + row_slice[dst_idx + 3] = 255; + } else if sa > 0 { + let a16 = u16::from(sa); + row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16); + row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16); + row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16); + let da = u16::from(row_slice[dst_idx + 3]); + row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; + } + } + return; + } + } + + // ── Scalar fallback (bounds-checked per pixel) ───────────────────── for dx in 0..effective_rw { let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; @@ -275,7 +494,6 @@ fn blit_row_opaque( row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16); row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16); row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16); - // Composite alpha: a_out = a_src + a_dst * (1 - a_src) let da = u16::from(row_slice[dst_idx + 3]); row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; } @@ -287,12 +505,16 @@ fn blit_row_opaque( /// /// Uses integer-only alpha blending. /// `x_map` provides precomputed source-X indices (one per destination column). +/// +/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is +/// wide enough and bounds can be pre-validated. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, clippy::inline_always, + // dx is used as both x_map index and dst offset, so an iterator is non-trivial. 
clippy::needless_range_loop )] #[inline(always)] @@ -310,6 +532,60 @@ fn blit_row_alpha( let opacity_u16 = (opacity * 255.0 + 0.5) as u16; let src_row_base = sy * sw * 4; + // ── SSE2 fast path ───────────────────────────────────────────────── + #[cfg(target_arch = "x86_64")] + { + let src_row_end = src_row_base + sw * 4; + let dst_end = (rx + effective_rw) * 4; + if src_row_end <= src.len() && dst_end <= row_slice.len() { + let chunks = effective_rw / 4; + for c in 0..chunks { + let dx = c * 4; + unsafe { + let pixels = [ + read_rgba_u32(src, src_row_base + x_map[dx] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4), + ]; + blend_4px_over_alpha_sse2( + row_slice.as_mut_ptr().add((rx + dx) * 4), + pixels, + opacity_u16, + ); + } + } + + // Scalar tail. + let tail_start = chunks * 4; + for dx in tail_start..effective_rw { + let sx = x_map[dx]; + let src_idx = src_row_base + sx * 4; + let sr = src[src_idx]; + let sg = src[src_idx + 1]; + let sb = src[src_idx + 2]; + let sa = src[src_idx + 3]; + let dst_idx = (rx + dx) * 4; + let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255); + if sa_eff == 255 { + row_slice[dst_idx] = sr; + row_slice[dst_idx + 1] = sg; + row_slice[dst_idx + 2] = sb; + row_slice[dst_idx + 3] = 255; + } else if sa_eff > 0 { + row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], sa_eff); + row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], sa_eff); + row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], sa_eff); + let da = u16::from(row_slice[dst_idx + 3]); + row_slice[dst_idx + 3] = + (sa_eff + ((da * (255 - sa_eff) + 128) >> 8)).min(255) as u8; + } + } + return; + } + } + + // ── Scalar fallback ──────────────────────────────────────────────── for dx in 0..effective_rw { let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; @@ -327,7 +603,6 @@ fn blit_row_alpha( continue; } - // Effective 
alpha: (sa * opacity) / 255, done in integer. let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255); if sa_eff == 255 { row_slice[dst_idx] = sr;