diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs index c7a6a829..9489df8e 100644 --- a/crates/core/src/frame_pool.rs +++ b/crates/core/src/frame_pool.rs @@ -139,6 +139,11 @@ impl FramePool { /// Get pooled storage for at least `min_len` elements. /// /// If `min_len` doesn't fit in any bucket, returns a non-pooled buffer of exact size. + /// + /// On the first miss for a given bucket (cold start), an extra buffer is + /// allocated and placed into the pool so that the *next* `get()` at the + /// same size is a hit. This amortises cold-start allocation cost without + /// pre-allocating every bucket size up front. pub fn get(&self, min_len: usize) -> PooledFrameData { let (handle, bucket_idx, bucket_size, maybe_buf) = { let Ok(mut guard) = self.inner.lock() else { @@ -154,6 +159,12 @@ impl FramePool { guard.hits += 1; } else { guard.misses += 1; + // Lazy preallocate: on first miss for this bucket, seed the + // pool with one extra buffer so subsequent gets are hits. + if guard.buckets[bucket_idx].is_empty() && guard.buckets[bucket_idx].capacity() == 0 + { + guard.buckets[bucket_idx].push(vec![T::default(); bucket_size]); + } } (self.handle(), bucket_idx, bucket_size, buf) }; diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml index a5cf11e4..8410f58d 100644 --- a/crates/engine/Cargo.toml +++ b/crates/engine/Cargo.toml @@ -64,5 +64,9 @@ indexmap = { workspace = true } name = "compositor_pipeline" harness = false +[[bench]] +name = "compositor_only" +harness = false + [lints] workspace = true diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs new file mode 100644 index 00000000..5a6263fe --- /dev/null +++ b/crates/engine/benches/compositor_only.rs @@ -0,0 +1,547 @@ +// SPDX-FileCopyrightText: © 2025 StreamKit Contributors +// +// SPDX-License-Identifier: MPL-2.0 + +#![allow(clippy::disallowed_macros)] // Bench binary intentionally uses eprintln!/println! for output. 
+#![allow(clippy::expect_used)] // Panicking on errors is fine in a benchmark binary. +#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::cast_precision_loss)] + +//! Compositor-only microbenchmark — measures `composite_frame` in isolation +//! (no VP9 encode, no mux, no async runtime overhead). +//! +//! Exercises the following scenarios across multiple resolutions: +//! +//! - 1 layer RGBA (baseline) +//! - 2 layers RGBA (PiP) +//! - 4 layers RGBA +//! - 2 layers mixed I420 + RGBA (measures YUV→RGBA conversion overhead) +//! - 2 layers mixed NV12 + RGBA +//! - 2 layers RGBA with rotation +//! - 2 layers RGBA, static (same data each frame — for future cache-hit measurement) +//! +//! ## Usage +//! +//! Quick run (default 200 frames @ 1280×720): +//! +//! ```bash +//! cargo bench -p streamkit-engine --bench compositor_only +//! ``` +//! +//! Custom parameters: +//! +//! ```bash +//! cargo bench -p streamkit-engine --bench compositor_only -- --frames 500 --width 1920 --height 1080 +//! ``` + +use std::sync::Arc; +use std::time::Instant; + +use streamkit_core::frame_pool::PooledVideoData; +use streamkit_core::types::PixelFormat; +use streamkit_core::VideoFramePool; + +// Re-use the compositor kernel and pixel_ops directly. +use streamkit_nodes::video::compositor::config::Rect; +use streamkit_nodes::video::compositor::kernel::{composite_frame, ConversionCache, LayerSnapshot}; +use streamkit_nodes::video::compositor::overlay::DecodedOverlay; +use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420; + +// ── Default benchmark parameters ──────────────────────────────────────────── + +const DEFAULT_WIDTH: u32 = 1280; +const DEFAULT_HEIGHT: u32 = 720; +const DEFAULT_FRAME_COUNT: u32 = 200; + +// ── Arg parser ────────────────────────────────────────────────────────────── + +struct BenchArgs { + width: u32, + height: u32, + frame_count: u32, + iterations: u32, + /// Optional filter: only run scenarios whose label contains this substring. 
+ filter: Option<String>, +} + +impl BenchArgs { + fn parse() -> Self { + let args: Vec<String> = std::env::args().collect(); + let mut cfg = Self { + width: DEFAULT_WIDTH, + height: DEFAULT_HEIGHT, + frame_count: DEFAULT_FRAME_COUNT, + iterations: 3, + filter: None, + }; + let mut i = 1; + while i < args.len() { + match args[i].as_str() { + "--width" | "-w" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.width = v.parse().unwrap_or(cfg.width); + } + }, + "--height" | "-h" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.height = v.parse().unwrap_or(cfg.height); + } + }, + "--frames" | "-n" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.frame_count = v.parse().unwrap_or(cfg.frame_count); + } + }, + "--iterations" | "-i" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.iterations = v.parse().unwrap_or(cfg.iterations); + } + }, + "--filter" | "-f" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.filter = Some(v.clone()); + } + }, + _ => {}, + } + i += 1; + } + cfg + } +} + +// ── Frame generators ──────────────────────────────────────────────────────── + +/// Generate an RGBA8 color-bar frame (opaque, all alpha = 255). +fn generate_rgba_frame(width: u32, height: u32) -> Vec<u8> { + let w = width as usize; + let h = height as usize; + let mut data = vec![0u8; w * h * 4]; + // Simple vertical gradient bars for visual distinctness. + let bar_colors: &[(u8, u8, u8)] = &[ + (191, 191, 191), // white + (191, 191, 0), // yellow + (0, 191, 191), // cyan + (0, 191, 0), // green + (191, 0, 191), // magenta + (191, 0, 0), // red + (0, 0, 191), // blue + ]; + for row in 0..h { + for col in 0..w { + let bar_idx = col * bar_colors.len() / w; + let (r, g, b) = bar_colors[bar_idx]; + let off = (row * w + col) * 4; + data[off] = r; + data[off + 1] = g; + data[off + 2] = b; + data[off + 3] = 255; + } + } + data +} + +/// Generate an I420 frame by converting an RGBA frame. 
+fn generate_i420_frame(width: u32, height: u32) -> Vec<u8> { + let rgba = generate_rgba_frame(width, height); + rgba8_to_i420(&rgba, width, height) +} + +/// Generate an NV12 frame by converting an RGBA frame. +fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> { + let rgba = generate_rgba_frame(width, height); + let w = width as usize; + let h = height as usize; + let chroma_w = w.div_ceil(2); + let chroma_h = h.div_ceil(2); + let nv12_size = w * h + chroma_w * 2 * chroma_h; + let mut nv12 = vec![0u8; nv12_size]; + streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf( + &rgba, width, height, &mut nv12, + ); + nv12 +} + +// ── Compositing harness ───────────────────────────────────────────────────── + +/// Call the real `composite_frame` kernel for `frame_count` iterations, +/// returning per-frame timing statistics. This exercises all kernel +/// optimizations: conversion cache, skip-canvas-clear, identity-scale +/// fast-path, precomputed x-map, SSE2 blend, etc. +/// +/// Uses a real `VideoFramePool` to match production behaviour (pooled buffer +/// reuse instead of per-frame heap allocation). 
+fn bench_composite( + _label: &str, + canvas_w: u32, + canvas_h: u32, + layers: &[Option<LayerSnapshot>], + frame_count: u32, +) -> BenchResult { + let empty_overlays: Vec<Arc<DecodedOverlay>> = Vec::new(); + let mut conversion_cache = ConversionCache::new(); + let pool = VideoFramePool::video_default(); + + let start = Instant::now(); + + for _ in 0..frame_count { + let _result = composite_frame( + canvas_w, + canvas_h, + layers, + &empty_overlays, + &empty_overlays, + Some(&pool), + &mut conversion_cache, + ); + } + + let elapsed = start.elapsed(); + BenchResult { total_secs: elapsed.as_secs_f64(), frame_count } +} + +struct BenchResult { + total_secs: f64, + frame_count: u32, +} + +impl BenchResult { + fn fps(&self) -> f64 { + f64::from(self.frame_count) / self.total_secs + } + + fn ms_per_frame(&self) -> f64 { + self.total_secs * 1000.0 / f64::from(self.frame_count) + } +} + +// ── Scenario definitions ──────────────────────────────────────────────────── + +struct Scenario { + label: String, + layers: Vec<Option<LayerSnapshot>>, +} + +fn make_layer( + data: Vec<u8>, + width: u32, + height: u32, + pixel_format: PixelFormat, + rect: Option<Rect>, + opacity: f32, + z_index: i32, + rotation_degrees: f32, +) -> Option<LayerSnapshot> { + Some(LayerSnapshot { + data: Arc::new(PooledVideoData::from_vec(data)), + width, + height, + pixel_format, + rect, + opacity, + z_index, + rotation_degrees, + }) +} + +fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> { + let pip_w = canvas_w / 3; + let pip_h = canvas_h / 3; + let pip_x = (canvas_w - pip_w - 20) as i32; + let pip_y = (canvas_h - pip_h - 20) as i32; + + vec![ + // 1 layer RGBA — baseline + Scenario { + label: "1-layer-rgba".to_string(), + layers: vec![make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + )], + }, + // 2 layers RGBA (PiP) + Scenario { + label: "2-layer-rgba-pip".to_string(), + layers: vec![ + make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 
1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 4 layers RGBA + Scenario { + label: "4-layer-rgba".to_string(), + layers: vec![ + make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: 20, y: 20, width: pip_w, height: pip_h }), + 0.8, + 2, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: 20, y: pip_y, width: pip_w, height: pip_h }), + 0.7, + 3, + 0.0, + ), + ], + }, + // 2 layers: I420 bg + RGBA PiP (measures conversion overhead) + Scenario { + label: "2-layer-i420+rgba".to_string(), + layers: vec![ + make_layer( + generate_i420_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::I420, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 2 layers: NV12 bg + RGBA PiP + Scenario { + label: "2-layer-nv12+rgba".to_string(), + layers: vec![ + make_layer( + generate_nv12_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Nv12, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 2 layers RGBA with rotation on PiP + Scenario { + label: "2-layer-rgba-rotated".to_string(), + layers: vec![ + make_layer( + 
generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 15.0, // 15° rotation + ), + ], + }, + // 2 layers RGBA, static (same Arc — for future cache-hit measurement) + Scenario { + label: "2-layer-rgba-static".to_string(), + layers: { + let bg = + Arc::new(PooledVideoData::from_vec(generate_rgba_frame(canvas_w, canvas_h))); + let pip = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(pip_w, pip_h))); + vec![ + Some(LayerSnapshot { + data: bg, + width: canvas_w, + height: canvas_h, + pixel_format: PixelFormat::Rgba8, + rect: None, + opacity: 1.0, + z_index: 0, + rotation_degrees: 0.0, + }), + Some(LayerSnapshot { + data: pip, + width: pip_w, + height: pip_h, + pixel_format: PixelFormat::Rgba8, + rect: Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + opacity: 0.9, + z_index: 1, + rotation_degrees: 0.0, + }), + ] + }, + }, + ] +} + +// ── Main ──────────────────────────────────────────────────────────────────── + +fn main() { + let args = BenchArgs::parse(); + + let resolutions: &[(u32, u32)] = if args.width == DEFAULT_WIDTH && args.height == DEFAULT_HEIGHT + { + // Default: run at multiple resolutions. + &[(640, 480), (1280, 720), (1920, 1080)] + } else { + // Custom: run at the specified resolution only. + // (Leak to get 'static — acceptable in a short-lived bench binary.) 
+ let res = Box::leak(Box::new([(args.width, args.height)])); + res + }; + + eprintln!("╔══════════════════════════════════════════════════════════╗"); + eprintln!("║ Compositor-Only Microbenchmark ║"); + eprintln!("╠══════════════════════════════════════════════════════════╣"); + eprintln!( + "║ Resolutions : {:<41}║", + resolutions.iter().map(|(w, h)| format!("{w}×{h}")).collect::<Vec<_>>().join(", ") + ); + eprintln!("║ Frames : {:<41}║", args.frame_count); + eprintln!("║ Iterations : {:<41}║", args.iterations); + if let Some(ref f) = args.filter { + eprintln!("║ Filter : {f:<41}║"); + } + eprintln!("╚══════════════════════════════════════════════════════════╝"); + eprintln!(); + + let mut json_results: Vec<serde_json::Value> = Vec::new(); + + for &(w, h) in resolutions { + eprintln!("── {w}×{h} ──────────────────────────────────────────────"); + + let scenarios = build_scenarios(w, h); + + for scenario in &scenarios { + if let Some(ref filter) = args.filter { + if !scenario.label.contains(filter.as_str()) { + continue; + } + } + + let mut iter_results = Vec::with_capacity(args.iterations as usize); + + for iter in 1..=args.iterations { + let result = + bench_composite(&scenario.label, w, h, &scenario.layers, args.frame_count); + eprintln!( + " {:<28} iter {iter}/{}: {:>8.1} fps ({:.2} ms/frame)", + scenario.label, + args.iterations, + result.fps(), + result.ms_per_frame(), + ); + iter_results.push(result); + } + + // Summary for this scenario. 
+ let fps_values: Vec<f64> = iter_results.iter().map(BenchResult::fps).collect(); + let ms_values: Vec<f64> = iter_results.iter().map(BenchResult::ms_per_frame).collect(); + let mean_fps = fps_values.iter().sum::<f64>() / fps_values.len() as f64; + let mean_ms = ms_values.iter().sum::<f64>() / ms_values.len() as f64; + let min_ms = ms_values.iter().copied().fold(f64::INFINITY, f64::min); + let max_ms = ms_values.iter().copied().fold(f64::NEG_INFINITY, f64::max); + + eprintln!( + " {:<28} avg: {:>8.1} fps ({:.2} ms/frame, min={:.2}, max={:.2})", + "", mean_fps, mean_ms, min_ms, max_ms, + ); + + json_results.push(serde_json::json!({ + "benchmark": "compositor_only", + "scenario": scenario.label, + "width": w, + "height": h, + "frame_count": args.frame_count, + "iterations": args.iterations, + "mean_fps": mean_fps, + "mean_ms_per_frame": mean_ms, + "min_ms_per_frame": min_ms, + "max_ms_per_frame": max_ms, + })); + } + eprintln!(); + } + + // Machine-readable JSON output. + println!("{}", serde_json::to_string_pretty(&json_results).expect("JSON serialization")); +} diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs index c173360e..621fe893 100644 --- a/crates/nodes/src/video/compositor/kernel.rs +++ b/crates/nodes/src/video/compositor/kernel.rs @@ -19,6 +19,133 @@ use super::pixel_ops::{ // ── Compositing kernel (runs on a persistent blocking thread) ──────────────── +// ── YUV → RGBA conversion cache ───────────────────────────────────────────── + +/// Cached RGBA conversion result for a single layer slot. +struct CachedConversion { + /// Identity of the source data (`Arc::as_ptr` cast to `usize`). + /// When the `Arc` pointer hasn't changed between frames + /// the underlying data is identical and the conversion can be skipped. + data_identity: usize, + width: u32, + height: u32, + /// Pre-converted RGBA8 data, stored as a plain `Vec<u8>`. + rgba: Vec<u8>, +} + +/// Per-slot cache for YUV → RGBA conversions. 
+/// +/// Avoids redundant per-frame I420/NV12 → RGBA8 conversion when the source +/// `Arc` hasn't changed since the previous frame. +/// +/// Also caches the first-layer alpha-scan result so that the canvas-clear +/// skip check doesn't re-scan every frame when the source hasn't changed. +pub struct ConversionCache { + entries: Vec<Option<CachedConversion>>, + /// Cached result of the alpha-opaqueness scan for the first visible layer. + /// `(data_identity, all_opaque)` — valid when the `Arc` pointer matches. + first_layer_alpha_cache: Option<(usize, bool)>, +} + +impl ConversionCache { + pub const fn new() -> Self { + Self { entries: Vec::new(), first_layer_alpha_cache: None } + } + + /// Check whether the first visible layer's source data is fully opaque. + /// + /// For I420/NV12 layers, the converted RGBA always has alpha == 255, so + /// we return `true` immediately without scanning. For RGBA layers we + /// scan once and cache the result keyed by `Arc::as_ptr`. + fn first_layer_all_opaque(&mut self, layer: &LayerSnapshot, rgba_data: &[u8]) -> bool { + // I420/NV12 → RGBA conversion always writes alpha = 255. + if layer.pixel_format != PixelFormat::Rgba8 { + return true; + } + + let identity = Arc::as_ptr(&layer.data) as usize; + if let Some((cached_id, cached_result)) = self.first_layer_alpha_cache { + if cached_id == identity { + return cached_result; + } + } + + let all_opaque = rgba_data.chunks_exact(4).all(|px| px[3] == 255); + self.first_layer_alpha_cache = Some((identity, all_opaque)); + all_opaque + } + + /// Return a previously-cached RGBA slice for `slot_idx`. + /// + /// # Panics + /// + /// Panics if the slot has not been populated by a prior `get_or_convert` + /// call for the same `layer`. This is only called in the second pass of + /// `composite_frame` after the first pass has ensured every non-RGBA + /// layer has been converted. 
+ fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { + #[allow(clippy::expect_used)] + let cached = + self.entries[slot_idx].as_ref().expect("get_cached called before get_or_convert"); + let needed = layer.width as usize * layer.height as usize * 4; + &cached.rgba[..needed] + } + + /// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`. + /// Returns a slice of RGBA8 data. + fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { + let identity = Arc::as_ptr(&layer.data) as usize; + + // Ensure the cache Vec is large enough. + if self.entries.len() <= slot_idx { + self.entries.resize_with(slot_idx + 1, || None); + } + + // Check if the cached entry is still valid. + let needs_convert = self.entries[slot_idx].as_ref().map_or(true, |cached| { + cached.data_identity != identity + || cached.width != layer.width + || cached.height != layer.height + }); + + if needs_convert { + let needed = layer.width as usize * layer.height as usize * 4; + // Reuse the existing allocation if possible. + let mut rgba = self.entries[slot_idx].take().map(|c| c.rgba).unwrap_or_default(); + if rgba.len() < needed { + rgba.resize(needed, 0); + } + + match layer.pixel_format { + PixelFormat::I420 => { + i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba); + }, + PixelFormat::Nv12 => { + nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba); + }, + PixelFormat::Rgba8 => { + // Should not be called for RGBA, but handle gracefully. + rgba[..needed].copy_from_slice(&layer.data.as_slice()[..needed]); + }, + } + + self.entries[slot_idx] = Some(CachedConversion { + data_identity: identity, + width: layer.width, + height: layer.height, + rgba, + }); + } + + // SAFETY: we just inserted into this slot above when `needs_convert` was true, + // and the slot was already `Some` when `needs_convert` was false. 
+ #[allow(clippy::expect_used)] + let cached = self.entries[slot_idx].as_ref().expect("just inserted"); + let needed = layer.width as usize * layer.height as usize * 4; + &cached.rgba[..needed] + } +} + /// Snapshot of one input layer's data for the blocking compositor thread. pub struct LayerSnapshot { pub data: Arc<PooledVideoData>, @@ -27,8 +154,10 @@ pub struct LayerSnapshot { pub pixel_format: PixelFormat, pub rect: Option<Rect>, pub opacity: f32, - /// Visual stacking order. Lower values are drawn first (bottom). - /// Used to sort layers before compositing; ties broken by slot index. + /// Visual stacking order. Retained in the snapshot for diagnostic / + /// logging purposes even though sorting now happens before snapshot + /// construction. + #[allow(dead_code)] pub z_index: i32, /// Clockwise rotation in degrees around the destination rect centre. /// Default `0.0` means no rotation. @@ -56,8 +185,8 @@ pub struct CompositeResult { /// Composite all layers + overlays onto a fresh RGBA8 canvas buffer. /// Allocates from the video pool if available. /// -/// `i420_scratch` is a reusable buffer for I420/NV12→RGBA8 conversion, -/// avoiding per-frame allocation. +/// `conversion_cache` caches YUV→RGBA8 conversions across frames so that +/// unchanged layers skip the conversion entirely. pub fn composite_frame( canvas_w: u32, canvas_h: u32, @@ -65,7 +194,7 @@ pub fn composite_frame( image_overlays: &[Arc<DecodedOverlay>], text_overlays: &[Arc<DecodedOverlay>], video_pool: Option<&streamkit_core::VideoFramePool>, - i420_scratch: &mut Vec<u8>, + conversion_cache: &mut ConversionCache, ) -> streamkit_core::frame_pool::PooledVideoData { let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4; @@ -74,36 +203,83 @@ - // Zero the buffer (transparent black). let buf = pooled.as_mut_slice(); - buf[..total_bytes].fill(0); + + // Two-pass source resolution. + // + // Pass 1: populate the conversion cache for every non-RGBA layer. 
+ // `slot_idx` uses the position in the `layers` slice (which preserves + // `None` holes) so that cache indices stay stable even when some slots + // have no frame. + for (slot_idx, entry) in layers.iter().enumerate() { + if let Some(layer) = entry { + if layer.pixel_format != PixelFormat::Rgba8 { + conversion_cache.get_or_convert(slot_idx, layer); + } + } + } + + // Between pass 1 and pass 2: check whether the first layer allows + // skipping the canvas clear. We do the alpha-opaqueness check here + // while `conversion_cache` is still mutably available. The result + // is a simple bool so no borrows leak into pass 2. + let skip_clear = layers + .iter() + .enumerate() + .find_map(|(i, e)| e.as_ref().map(|l| (i, l))) + .map_or(false, |(_slot_idx, layer)| { + // Quick checks that don't need the pixel data. + if layer.opacity < 1.0 || layer.rotation_degrees.abs() >= 0.01 { + return false; + } + let covers = layer.rect.as_ref().map_or(true, |r| { + r.x <= 0 + && r.y <= 0 + && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) + && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) + }); + if !covers { + return false; + } + // Alpha check — needs mutable access to conversion_cache. + match layer.pixel_format { + // I420/NV12 → RGBA conversion always writes alpha = 255. + PixelFormat::I420 | PixelFormat::Nv12 => true, + PixelFormat::Rgba8 => { + conversion_cache.first_layer_all_opaque(layer, layer.data.as_slice()) + }, + } + }); + if !skip_clear { + buf[..total_bytes].fill(0); + } + + // Pass 2: build resolved references. The mutable borrow of + // `conversion_cache` from pass 1 is released, so we can now take + // shared references into the cache alongside references into `layers`. 
+ let resolved: Vec<Option<(&LayerSnapshot, &[u8])>> = layers + .iter() + .enumerate() + .map(|(slot_idx, entry)| { + entry.as_ref().map(|layer| { + let src_data: &[u8] = match layer.pixel_format { + PixelFormat::Rgba8 => layer.data.as_slice(), + PixelFormat::I420 | PixelFormat::Nv12 => { + // Cache was populated in pass 1; this is a shared + // read that cannot fail. + conversion_cache.get_cached(slot_idx, layer) + }, + }; + (layer, src_data) + }) + }) + .collect(); // Blit each layer (in order — first layer is bottom, last is top). - // I420 layers are converted to RGBA8 on-the-fly using the scratch buffer. - for layer in layers.iter().flatten() { + for (layer, src_data) in resolved.iter().flatten() { let dst_rect = layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h }); - let src_data: &[u8] = match layer.pixel_format { - PixelFormat::Rgba8 => layer.data.as_slice(), - PixelFormat::I420 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch); - &i420_scratch[..needed] - }, - PixelFormat::Nv12 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch); - &i420_scratch[..needed] - }, - }; - scale_blit_rgba_rotated( buf, canvas_w, diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs index 34dde60a..9c45af98 100644 --- a/crates/nodes/src/video/compositor/mod.rs +++ b/crates/nodes/src/video/compositor/mod.rs @@ -25,12 +25,12 @@ //! - Bilinear / Lanczos scaling (MVP uses nearest-neighbor). 
pub mod config -mod kernel; -mod overlay; +pub mod kernel; +pub mod overlay; pub mod pixel_ops; use async_trait::async_trait; -use config::{CompositorConfig, Rect}; +use config::CompositorConfig; use kernel::{CompositeResult, CompositeWorkItem, LayerSnapshot}; use overlay::{decode_image_overlay, rasterize_text_overlay, DecodedOverlay}; use schemars::schema_for; @@ -48,7 +48,7 @@ use streamkit_core::{ }; use tokio::sync::mpsc; -use kernel::composite_frame; +use kernel::{composite_frame, ConversionCache}; // ── Input slot ────────────────────────────────────────────────────────────── @@ -59,6 +59,63 @@ struct InputSlot { latest_frame: Option, } +// ── Cached layer config ───────────────────────────────────────────────────── + +/// Pre-resolved layer configuration for a single slot. +/// Rebuilt only when compositor config or pin set changes, avoiding +/// per-frame `HashMap` lookups and `sort_by` calls. +#[derive(Clone)] +struct ResolvedSlotConfig { + rect: Option<config::Rect>, + opacity: f32, + z_index: i32, + rotation_degrees: f32, +} + +/// Rebuild the per-slot resolved configs and the z-sorted draw order. +/// +/// Called once at startup and whenever `UpdateParams` or pin management +/// changes the layer set. The returned draw order is a list of slot +/// indices sorted by `(z_index, slot_index)`. +fn rebuild_layer_cache( + slots: &[InputSlot], + config: &CompositorConfig, +) -> (Vec<ResolvedSlotConfig>, Vec<usize>) { + let num_slots = slots.len(); + let mut configs: Vec<ResolvedSlotConfig> = Vec::with_capacity(num_slots); + for (idx, slot) in slots.iter().enumerate() { + let layer_cfg = config.layers.get(&slot.name); + #[allow(clippy::option_if_let_else)] + let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg { + (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees) + } else if idx > 0 && num_slots > 1 { + // Auto-PiP: non-first layers without explicit config. 
+ let pip_w = config.width / 3; + let pip_h = config.height / 3; + #[allow(clippy::cast_possible_wrap)] + let pip_x = (config.width - pip_w - 20) as i32; + #[allow(clippy::cast_possible_wrap)] + let pip_y = (config.height - pip_h - 20) as i32; + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] + ( + Some(config::Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + idx as i32, + 0.0, + ) + } else { + (None, 1.0, 0, 0.0) + }; + configs.push(ResolvedSlotConfig { rect, opacity, z_index, rotation_degrees }); + } + + // Pre-sort by (z_index, slot_index). + let mut draw_order: Vec<usize> = (0..num_slots).collect(); + draw_order.sort_by(|&a, &b| configs[a].z_index.cmp(&configs[b].z_index).then(a.cmp(&b))); + + (configs, draw_order) +} + // ── Node ──────────────────────────────────────────────────────────────────── /// Composites multiple raw video inputs onto a single RGBA8 canvas with @@ -285,9 +342,9 @@ impl ProcessorNode for CompositorNode { let (result_tx, mut result_rx) = tokio::sync::mpsc::channel::<CompositeResult>(2); let composite_thread = tokio::task::spawn_blocking(move || { - // Persistent scratch buffer for I420→RGBA8 layer conversion, - // reused across frames to avoid per-frame allocation. - let mut i420_to_rgba_scratch: Vec<u8> = Vec::new(); + // Per-slot cache for YUV→RGBA conversions. Avoids redundant + // conversion when the source Arc hasn't changed between frames. 
+ let mut conversion_cache = ConversionCache::new(); while let Some(work) = work_rx.blocking_recv() { let rgba_buf = composite_frame( @@ -297,7 +354,7 @@ &work.image_overlays, &work.text_overlays, work.video_pool.as_deref(), - &mut i420_to_rgba_scratch, + &mut conversion_cache, ); let result = CompositeResult { rgba_data: rgba_buf }; if result_tx.blocking_send(result).is_err() { @@ -309,6 +366,14 @@ let mut output_seq: u64 = 0; let mut stop_reason: &str = "shutdown"; + // ── Cached layer config + draw order ──────────────────────────── + // Rebuilt only when config or pin set changes (UpdateParams, + // pin add/remove, channel close). Avoids per-frame HashMap + // lookups and sort_by calls. + let mut layer_configs_dirty = true; + let mut resolved_configs: Vec<ResolvedSlotConfig> = Vec::new(); + let mut sorted_draw_order: Vec<usize> = Vec::new(); + loop { // ── Take at most one frame from every slot (non-blocking) ─── // We intentionally take only one frame per slot per iteration so @@ -347,6 +412,7 @@ params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -364,6 +430,7 @@ msg, &mut slots, ); + layer_configs_dirty = true; } // Wait for a frame from any connected input. 
@@ -379,6 +446,7 @@ slots[slot_idx].name ); slots.remove(slot_idx); + layer_configs_dirty = true; if slots.is_empty() { stop_reason = "all_inputs_closed"; should_break = true; @@ -425,6 +493,7 @@ params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -440,6 +509,7 @@ msg, &mut slots, ); + layer_configs_dirty = true; } } continue; @@ -463,6 +533,7 @@ params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -473,69 +544,40 @@ if let Some(ref mut pmrx) = pin_mgmt_rx { while let Ok(msg) = pmrx.try_recv() { Self::handle_pin_management(&mut self, msg, &mut slots); + layer_configs_dirty = true; } } + // ── Rebuild layer config cache if needed ───────────────────── + if layer_configs_dirty { + let (cfgs, order) = rebuild_layer_cache(&slots, &self.config); + resolved_configs = cfgs; + sorted_draw_order = order; + layer_configs_dirty = false; + } + // ── Send work to persistent compositing thread ───────────── - // Collect the data we need to send to the blocking thread. - let num_slots = slots.len(); - let mut layers: Vec<Option<LayerSnapshot>> = slots + // Build layer snapshots in pre-sorted draw order using the + // cached per-slot configs (no HashMap lookup, no sort). + let layers: Vec<Option<LayerSnapshot>> = sorted_draw_order .iter() - .enumerate() - .map(|(idx, slot)| { - slot.latest_frame.as_ref().map(|f| { - let layer_cfg = self.config.layers.get(&slot.name); - #[allow(clippy::option_if_let_else)] - let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg - { - // Explicit per-layer config. 
- (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees) - } else if idx > 0 && num_slots > 1 { - // Auto-PiP: non-first layers without explicit config - // are placed in the bottom-right corner at 1/3 canvas - // size with slight transparency. - let pip_w = self.config.width / 3; - let pip_h = self.config.height / 3; - #[allow(clippy::cast_possible_wrap)] - let pip_x = (self.config.width - pip_w - 20) as i32; - #[allow(clippy::cast_possible_wrap)] - let pip_y = (self.config.height - pip_h - 20) as i32; - #[allow(clippy::cast_possible_wrap)] - ( - Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), - 0.9, - idx as i32, - 0.0, - ) - } else { - // First layer (or single input): fill the canvas. - (None, 1.0, 0, 0.0) - }; + .map(|&idx| { + slots[idx].latest_frame.as_ref().map(|f| { + let cfg = &resolved_configs[idx]; LayerSnapshot { data: f.data.clone(), width: f.width, height: f.height, pixel_format: f.pixel_format, - rect, - opacity, - z_index, - rotation_degrees, + rect: cfg.rect.clone(), + opacity: cfg.opacity, + z_index: cfg.z_index, + rotation_degrees: cfg.rotation_degrees, } }) }) .collect(); - // Sort layers by z_index so that lower values are drawn first - // (bottom of the stack). `None` entries (slots without a frame) - // are pushed to the end — they are skipped during compositing - // anyway. 
- layers.sort_by(|a, b| match (a, b) { - (Some(la), Some(lb)) => la.z_index.cmp(&lb.z_index), - (Some(_), None) => std::cmp::Ordering::Less, - (None, Some(_)) => std::cmp::Ordering::Greater, - (None, None) => std::cmp::Ordering::Equal, - }); - stats_tracker.received(); let work_item = CompositeWorkItem { @@ -773,7 +815,7 @@ mod tests { use crate::test_utils::{ assert_state_initializing, assert_state_running, assert_state_stopped, create_test_context, }; - use config::LayerConfig; + use config::{LayerConfig, Rect}; use pixel_ops::scale_blit_rgba; use std::collections::HashMap; use tokio::sync::mpsc; @@ -851,8 +893,8 @@ mod tests { #[test] fn test_composite_frame_empty_layers() { // No layers, no overlays -> transparent black canvas. - let mut scratch = Vec::new(); - let result = composite_frame(4, 4, &[], &[], &[], None, &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(4, 4, &[], &[], &[], None, &mut cache); let buf = result.as_slice(); assert_eq!(buf.len(), 4 * 4 * 4); assert!(buf.iter().all(|&b| b == 0)); @@ -872,8 +914,8 @@ mod tests { rotation_degrees: 0.0, }; - let mut scratch = Vec::new(); - let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut cache); let buf = result.as_slice(); // Entire canvas should be red (scaled from 2x2 to 4x4). @@ -912,9 +954,9 @@ mod tests { rotation_degrees: 0.0, }; - let mut scratch = Vec::new(); + let mut cache = ConversionCache::new(); let result = - composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut scratch); + composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut cache); let buf = result.as_slice(); // (0,0) should be red. 
@@ -1079,8 +1121,8 @@ mod tests { let pool = FramePool::<u8>::preallocated(&[total], 2); assert_eq!(pool.stats().buckets[0].available, 2); - let mut scratch = Vec::new(); - let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut cache); assert_eq!(result.as_slice().len(), total); // One buffer was taken from the pool. assert_eq!(pool.stats().buckets[0].available, 1); diff --git a/crates/nodes/src/video/compositor/overlay.rs b/crates/nodes/src/video/compositor/overlay.rs index 3072870a..0fc1e6b2 100644 --- a/crates/nodes/src/video/compositor/overlay.rs +++ b/crates/nodes/src/video/compositor/overlay.rs @@ -36,13 +36,52 @@ pub fn decode_image_overlay(config: &ImageOverlayConfig) -> Result 0 && target_h > 0 && (w != target_w || h != target_h) { + let raw = rgba.into_raw(); + let scaled = prescale_rgba(&raw, w, h, target_w, target_h); + Ok(DecodedOverlay { + rgba_data: scaled, + width: target_w, + height: target_h, + rect: config.rect.clone(), + opacity: config.opacity, + }) + } else { + Ok(DecodedOverlay { + rgba_data: rgba.into_raw(), + width: w, + height: h, + rect: config.rect.clone(), + opacity: config.opacity, + }) + } +} + +/// Nearest-neighbor scale an RGBA8 buffer from `(sw, sh)` to `(dw, dh)`. +/// Used once at config time so the per-frame blit is a 1:1 copy.
+fn prescale_rgba(src: &[u8], sw: u32, sh: u32, dw: u32, dh: u32) -> Vec<u8> { + let sw = sw as usize; + let sh = sh as usize; + let dw = dw as usize; + let dh = dh as usize; + let mut out = vec![0u8; dw * dh * 4]; + for dy in 0..dh { + let sy = dy * sh / dh; + for dx in 0..dw { + let sx = dx * sw / dw; + let si = (sy * sw + sx) * 4; + let di = (dy * dw + dx) * 4; + out[di..di + 4].copy_from_slice(&src[si..si + 4]); + } + } + out } // ── Bundled default font ──────────────────────────────────────────────────── diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs index 3464eb8e..3c6dd03e 100644 --- a/crates/nodes/src/video/compositor/pixel_ops.rs +++ b/crates/nodes/src/video/compositor/pixel_ops.rs @@ -110,28 +110,71 @@ pub fn scale_blit_rgba( let first_row_byte = ry * row_stride; let dst_rows = &mut dst[first_row_byte..]; + // ── Identity-scale fast path ─────────────────────────────────────── + // When source dimensions exactly match the destination rect and opacity + // is fully opaque, we can avoid per-pixel scaling entirely and use + // direct row copies (memcpy) for fully-opaque source rows. + if rw == sw && rh == sh && opacity >= 1.0 && src_col_skip == 0 && src_row_skip == 0 { + let src_row_bytes = sw * 4; + let copy_bytes = effective_rect_w * 4; + for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() { + let src_start = dy * src_row_bytes; + let src_end = src_start + copy_bytes; + if src_end > src.len() { + break; + } + let dst_start = rx * 4; + let dst_end = dst_start + copy_bytes; + if dst_end > row_slice.len() { + break; + } + // Check if the source row has any semi-transparent pixels. + // For fully-opaque rows, use bulk memcpy. For rows with alpha, + // fall back to per-pixel blending.
+ let src_row = &src[src_start..src_end]; + let all_opaque = src_row.chunks_exact(4).all(|px| px[3] == 255); + if all_opaque { + row_slice[dst_start..dst_end].copy_from_slice(src_row); + } else { + // Per-pixel alpha blend (identity scale, so sx == dx). + for dx in 0..effective_rect_w { + let si = dx * 4; + let sa = src_row[si + 3]; + if sa == 255 { + row_slice[dst_start + dx * 4..dst_start + dx * 4 + 4] + .copy_from_slice(&src_row[si..si + 4]); + } else if sa > 0 { + let di = dst_start + dx * 4; + let a16 = u16::from(sa); + row_slice[di] = blend_u8(src_row[si], row_slice[di], a16); + row_slice[di + 1] = blend_u8(src_row[si + 1], row_slice[di + 1], a16); + row_slice[di + 2] = blend_u8(src_row[si + 2], row_slice[di + 2], a16); + let da = u16::from(row_slice[di + 3]); + row_slice[di + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; + } + } + } + } + return; + } + + // ── Scaled blit path ─────────────────────────────────────────────── + // Precompute the source-X lookup table once. This replaces the per-pixel + // `(dx + src_col_skip) * sw / rw` integer division with a single table + // lookup in the inner blit loops. + let x_map: Vec<usize> = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect(); + if effective_rh >= RAYON_ROW_THRESHOLD { dst_rows.par_chunks_mut(row_stride).take(effective_rh).enumerate().for_each( |(dy, row_slice)| { let sy = (dy + src_row_skip) * sh / rh; - blit_row( - row_slice, - rx, - effective_rect_w, - src, - sw, - sh, - sy, - rw, - opacity, - src_col_skip, - ); + blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map); }, ); } else { for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() { let sy = (dy + src_row_skip) * sh / rh; - blit_row(row_slice, rx, effective_rect_w, src, sw, sh, sy, rw, opacity, src_col_skip); + blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map); } } } @@ -142,6 +185,9 @@ fn blit_row( /// rows in parallel.
The `row_slice` covers exactly one destination row /// starting at pixel column 0 (i.e. byte offset `rx * 4` is the first column /// we write to). +/// +/// `x_map` is a precomputed table mapping each destination column to the +/// corresponding source column, eliminating per-pixel integer division. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, @@ -155,18 +201,16 @@ fn blit_row( effective_rw: usize, src: &[u8], sw: usize, - sh: usize, sy: usize, - rw: usize, opacity: f32, - src_col_skip: usize, + x_map: &[usize], ) { // Fast path: when opacity is 1.0, we can skip the f32 multiply on alpha // and branch more cheaply. if opacity >= 1.0 { - blit_row_opaque(row_slice, rx, effective_rw, src, sw, sh, sy, rw, src_col_skip); + blit_row_opaque(row_slice, rx, effective_rw, src, sw, sy, x_map); } else { - blit_row_alpha(row_slice, rx, effective_rw, src, sw, sh, sy, rw, opacity, src_col_skip); + blit_row_alpha(row_slice, rx, effective_rw, src, sw, sy, opacity, x_map); } } @@ -180,16 +224,184 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 { ((val + (val >> 8)) >> 8) as u8 } +// ── SSE2 alpha-blend helpers (x86-64) ────────────────────────────────────── +// +// Process 4 RGBA pixels at a time using SSE2 integer arithmetic. +// Source pixels are gathered (non-contiguous via x_map), destination pixels +// are contiguous. The blend formula is identical to the scalar `blend_u8`: +// result = ((src*alpha + dst*(255-alpha) + 128) + ((…) >> 8)) >> 8 +// +// For the alpha channel we set source-alpha to 255 before blending so that +// `blend_u8(255, dst_alpha, src_alpha)` naturally computes the standard +// over-composite alpha `a_src + a_dst*(1-a_src)` (within ±1 of the scalar +// approximation — both are approximate divisions by 255). + +/// Read 4 bytes from `src` at `offset` as a native-endian `u32`. +/// +/// # Safety +/// +/// Caller must ensure `offset + 3 < src.len()`. 
+#[inline(always)] +unsafe fn read_rgba_u32(src: &[u8], offset: usize) -> u32 { + std::ptr::read_unaligned(src.as_ptr().add(offset) as *const u32) +} + +/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels +/// using SSE2 "over" compositing (no opacity modifier). +/// +/// # Safety +/// +/// `dst_ptr` must point to at least 16 writable bytes. Source pixel values +/// in `src_pixels` must be valid RGBA `u32` values. +#[cfg(target_arch = "x86_64")] +#[inline(always)] +unsafe fn blend_4px_over_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4]) { + use std::arch::x86_64::*; + + let zero = _mm_setzero_si128(); + let c255 = _mm_set1_epi16(255); + let c128 = _mm_set1_epi16(128); + + // Assemble 4 gathered source pixels into one register. + let src4 = _mm_set_epi32( + src_pixels[3] as i32, + src_pixels[2] as i32, + src_pixels[1] as i32, + src_pixels[0] as i32, + ); + + // Mask with 0xFF at each pixel's alpha-byte position (bytes 3,7,11,15). + let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32); + + // Fast path: all 4 source pixels fully opaque → direct copy. + let alpha_bytes = _mm_and_si128(src4, alpha_byte_mask); + if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, alpha_byte_mask)) == 0xFFFF { + _mm_storeu_si128(dst_ptr as *mut __m128i, src4); + return; + } + + // Fast path: all 4 source pixels fully transparent → nothing to do. + if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, zero)) == 0xFFFF { + return; + } + + let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i); + + // Replace source alpha channel with 255 for correct composite-alpha + // via blend_u8(255, dst_alpha, src_alpha). + let src_blend = _mm_or_si128(src4, alpha_byte_mask); + + // --- Low 2 pixels (u16 arithmetic) --- + let src_lo = _mm_unpacklo_epi8(src_blend, zero); + let dst_lo = _mm_unpacklo_epi8(dst4, zero); + + // Extract original source alpha and broadcast within each 4-u16 pixel group. 
+ let src_orig_lo = _mm_unpacklo_epi8(src4, zero); + // _MM_SHUFFLE(3,3,3,3) = 0xFF → replicate element 3 (alpha) to all 4 positions. + let alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF); + + let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo); + let val_lo = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)), + c128, + ); + let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8); + + // --- High 2 pixels --- + let src_hi = _mm_unpackhi_epi8(src_blend, zero); + let dst_hi = _mm_unpackhi_epi8(dst4, zero); + let src_orig_hi = _mm_unpackhi_epi8(src4, zero); + let alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF); + + let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi); + let val_hi = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)), + c128, + ); + let result_hi = _mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8); + + // Pack back to u8 and store. + _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi)); +} + +/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels +/// using SSE2 "over" compositing **with** an opacity multiplier applied to +/// each pixel's source alpha. +/// +/// # Safety +/// +/// `dst_ptr` must point to at least 16 writable bytes. 
+#[cfg(target_arch = "x86_64")] +#[inline(always)] +unsafe fn blend_4px_over_alpha_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4], opacity: u16) { + use std::arch::x86_64::*; + + let zero = _mm_setzero_si128(); + let c255 = _mm_set1_epi16(255); + let c128 = _mm_set1_epi16(128); + let opacity_v = _mm_set1_epi16(opacity as i16); + + let src4 = _mm_set_epi32( + src_pixels[3] as i32, + src_pixels[2] as i32, + src_pixels[1] as i32, + src_pixels[0] as i32, + ); + + let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i); + let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32); + let src_blend = _mm_or_si128(src4, alpha_byte_mask); + + // --- Low 2 pixels --- + let src_lo = _mm_unpacklo_epi8(src_blend, zero); + let dst_lo = _mm_unpacklo_epi8(dst4, zero); + + // Extract original alpha, apply opacity: sa_eff = (sa * opacity + 128) >> 8. + // Max value: (255*255+128)>>8 = 254, so no clamping needed. + let src_orig_lo = _mm_unpacklo_epi8(src4, zero); + let raw_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF); + let alpha_lo = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_lo, opacity_v), c128), 8); + + let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo); + let val_lo = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)), + c128, + ); + let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8); + + // --- High 2 pixels --- + let src_hi = _mm_unpackhi_epi8(src_blend, zero); + let dst_hi = _mm_unpackhi_epi8(dst4, zero); + let src_orig_hi = _mm_unpackhi_epi8(src4, zero); + let raw_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF); + let alpha_hi = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_hi, opacity_v), c128), 8); + + let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi); + let val_hi = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)), + c128, + ); + let result_hi = 
_mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8); + + _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi)); +} + /// Inner blit for fully-opaque layers (`opacity >= 1.0`). Skips the /// per-pixel f32 multiply on the source alpha channel. /// /// Uses integer-only alpha blending for semi-transparent source pixels. +/// `x_map` provides precomputed source-X indices (one per destination column). +/// +/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is +/// wide enough and bounds can be pre-validated. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, - clippy::inline_always + clippy::inline_always, + // dx is used as both x_map index and dst offset, so an iterator is non-trivial. + clippy::needless_range_loop )] #[inline(always)] fn blit_row_opaque( @@ -198,14 +410,65 @@ fn blit_row_opaque( effective_rw: usize, src: &[u8], sw: usize, - _sh: usize, sy: usize, - rw: usize, - src_col_skip: usize, + x_map: &[usize], ) { let src_row_base = sy * sw * 4; + + // ── SSE2 fast path: process 4 pixels at a time ───────────────────── + #[cfg(target_arch = "x86_64")] + { + // Pre-validate bounds so the inner SIMD loop is branch-free. + let src_row_end = src_row_base + sw * 4; + let dst_end = (rx + effective_rw) * 4; + if src_row_end <= src.len() && dst_end <= row_slice.len() { + let chunks = effective_rw / 4; + for c in 0..chunks { + let dx = c * 4; + // SAFETY: bounds pre-validated above; x_map values < sw; + // dst range (rx+dx)*4..(rx+dx+4)*4 < dst_end <= row_slice.len(). 
+ unsafe { + let pixels = [ + read_rgba_u32(src, src_row_base + x_map[dx] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4), + ]; + blend_4px_over_sse2(row_slice.as_mut_ptr().add((rx + dx) * 4), pixels); + } + } + + // Scalar tail for remaining 0-3 pixels. + let tail_start = chunks * 4; + for dx in tail_start..effective_rw { + let sx = x_map[dx]; + let src_idx = src_row_base + sx * 4; + let sr = src[src_idx]; + let sg = src[src_idx + 1]; + let sb = src[src_idx + 2]; + let sa = src[src_idx + 3]; + let dst_idx = (rx + dx) * 4; + if sa == 255 { + row_slice[dst_idx] = sr; + row_slice[dst_idx + 1] = sg; + row_slice[dst_idx + 2] = sb; + row_slice[dst_idx + 3] = 255; + } else if sa > 0 { + let a16 = u16::from(sa); + row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16); + row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16); + row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16); + let da = u16::from(row_slice[dst_idx + 3]); + row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; + } + } + return; + } + } + + // ── Scalar fallback (bounds-checked per pixel) ───────────────────── for dx in 0..effective_rw { - let sx = (dx + src_col_skip) * sw / rw; + let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; if src_idx + 3 >= src.len() { continue; @@ -231,7 +494,6 @@ fn blit_row_opaque( row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16); row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16); row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16); - // Composite alpha: a_out = a_src + a_dst * (1 - a_src) let da = u16::from(row_slice[dst_idx + 3]); row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; } @@ -242,12 +504,18 @@ fn blit_row_opaque( /// Applies the opacity multiplier to every source pixel's alpha channel. 
/// /// Uses integer-only alpha blending. +/// `x_map` provides precomputed source-X indices (one per destination column). +/// +/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is +/// wide enough and bounds can be pre-validated. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, - clippy::inline_always + clippy::inline_always, + // dx is used as both x_map index and dst offset, so an iterator is non-trivial. + clippy::needless_range_loop )] #[inline(always)] fn blit_row_alpha( @@ -256,18 +524,70 @@ fn blit_row_alpha( effective_rw: usize, src: &[u8], sw: usize, - _sh: usize, sy: usize, - rw: usize, opacity: f32, - src_col_skip: usize, + x_map: &[usize], ) { // Pre-compute opacity as a 0..255 integer multiplier. let opacity_u16 = (opacity * 255.0 + 0.5) as u16; let src_row_base = sy * sw * 4; + // ── SSE2 fast path ───────────────────────────────────────────────── + #[cfg(target_arch = "x86_64")] + { + let src_row_end = src_row_base + sw * 4; + let dst_end = (rx + effective_rw) * 4; + if src_row_end <= src.len() && dst_end <= row_slice.len() { + let chunks = effective_rw / 4; + for c in 0..chunks { + let dx = c * 4; + unsafe { + let pixels = [ + read_rgba_u32(src, src_row_base + x_map[dx] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4), + ]; + blend_4px_over_alpha_sse2( + row_slice.as_mut_ptr().add((rx + dx) * 4), + pixels, + opacity_u16, + ); + } + } + + // Scalar tail. 
+ let tail_start = chunks * 4; + for dx in tail_start..effective_rw { + let sx = x_map[dx]; + let src_idx = src_row_base + sx * 4; + let sr = src[src_idx]; + let sg = src[src_idx + 1]; + let sb = src[src_idx + 2]; + let sa = src[src_idx + 3]; + let dst_idx = (rx + dx) * 4; + let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255); + if sa_eff == 255 { + row_slice[dst_idx] = sr; + row_slice[dst_idx + 1] = sg; + row_slice[dst_idx + 2] = sb; + row_slice[dst_idx + 3] = 255; + } else if sa_eff > 0 { + row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], sa_eff); + row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], sa_eff); + row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], sa_eff); + let da = u16::from(row_slice[dst_idx + 3]); + row_slice[dst_idx + 3] = + (sa_eff + ((da * (255 - sa_eff) + 128) >> 8)).min(255) as u8; + } + } + return; + } + } + + // ── Scalar fallback ──────────────────────────────────────────────── for dx in 0..effective_rw { - let sx = (dx + src_col_skip) * sw / rw; + let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; if src_idx + 3 >= src.len() { continue; @@ -283,7 +603,6 @@ fn blit_row_alpha( continue; } - // Effective alpha: (sa * opacity) / 255, done in integer. let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255); if sa_eff == 255 { row_slice[dst_idx] = sr;