diff --git a/crates/core/src/frame_pool.rs b/crates/core/src/frame_pool.rs index c7a6a829..9489df8e 100644 --- a/crates/core/src/frame_pool.rs +++ b/crates/core/src/frame_pool.rs @@ -139,6 +139,11 @@ impl FramePool { /// Get pooled storage for at least `min_len` elements. /// /// If `min_len` doesn't fit in any bucket, returns a non-pooled buffer of exact size. + /// + /// On the first miss for a given bucket (cold start), an extra buffer is + /// allocated and placed into the pool so that the *next* `get()` at the + /// same size is a hit. This amortises cold-start allocation cost without + /// pre-allocating every bucket size up front. pub fn get(&self, min_len: usize) -> PooledFrameData { let (handle, bucket_idx, bucket_size, maybe_buf) = { let Ok(mut guard) = self.inner.lock() else { @@ -154,6 +159,12 @@ impl FramePool { guard.hits += 1; } else { guard.misses += 1; + // Lazy preallocate: on first miss for this bucket, seed the + // pool with one extra buffer so subsequent gets are hits. + if guard.buckets[bucket_idx].is_empty() && guard.buckets[bucket_idx].capacity() == 0 + { + guard.buckets[bucket_idx].push(vec![T::default(); bucket_size]); + } } (self.handle(), bucket_idx, bucket_size, buf) }; diff --git a/crates/engine/Cargo.toml b/crates/engine/Cargo.toml index a5cf11e4..8410f58d 100644 --- a/crates/engine/Cargo.toml +++ b/crates/engine/Cargo.toml @@ -64,5 +64,9 @@ indexmap = { workspace = true } name = "compositor_pipeline" harness = false +[[bench]] +name = "compositor_only" +harness = false + [lints] workspace = true diff --git a/crates/engine/benches/compositor_only.rs b/crates/engine/benches/compositor_only.rs new file mode 100644 index 00000000..5a6263fe --- /dev/null +++ b/crates/engine/benches/compositor_only.rs @@ -0,0 +1,547 @@ +// SPDX-FileCopyrightText: © 2025 StreamKit Contributors +// +// SPDX-License-Identifier: MPL-2.0 + +#![allow(clippy::disallowed_macros)] // Bench binary intentionally uses eprintln!/println! for output. 
+#![allow(clippy::expect_used)] // Panicking on errors is fine in a benchmark binary. +#![allow(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::cast_precision_loss)] + +//! Compositor-only microbenchmark — measures `composite_frame` in isolation +//! (no VP9 encode, no mux, no async runtime overhead). +//! +//! Exercises the following scenarios across multiple resolutions: +//! +//! - 1 layer RGBA (baseline) +//! - 2 layers RGBA (PiP) +//! - 4 layers RGBA +//! - 2 layers mixed I420 + RGBA (measures YUV→RGBA conversion overhead) +//! - 2 layers mixed NV12 + RGBA +//! - 2 layers RGBA with rotation +//! - 2 layers RGBA, static (same data each frame — for future cache-hit measurement) +//! +//! ## Usage +//! +//! Quick run (default 200 frames @ 1280×720): +//! +//! ```bash +//! cargo bench -p streamkit-engine --bench compositor_only +//! ``` +//! +//! Custom parameters: +//! +//! ```bash +//! cargo bench -p streamkit-engine --bench compositor_only -- --frames 500 --width 1920 --height 1080 +//! ``` + +use std::sync::Arc; +use std::time::Instant; + +use streamkit_core::frame_pool::PooledVideoData; +use streamkit_core::types::PixelFormat; +use streamkit_core::VideoFramePool; + +// Re-use the compositor kernel and pixel_ops directly. +use streamkit_nodes::video::compositor::config::Rect; +use streamkit_nodes::video::compositor::kernel::{composite_frame, ConversionCache, LayerSnapshot}; +use streamkit_nodes::video::compositor::overlay::DecodedOverlay; +use streamkit_nodes::video::compositor::pixel_ops::rgba8_to_i420; + +// ── Default benchmark parameters ──────────────────────────────────────────── + +const DEFAULT_WIDTH: u32 = 1280; +const DEFAULT_HEIGHT: u32 = 720; +const DEFAULT_FRAME_COUNT: u32 = 200; + +// ── Arg parser ────────────────────────────────────────────────────────────── + +struct BenchArgs { + width: u32, + height: u32, + frame_count: u32, + iterations: u32, + /// Optional filter: only run scenarios whose label contains this substring. 
+ filter: Option<String>, +} + +impl BenchArgs { + fn parse() -> Self { + let args: Vec<String> = std::env::args().collect(); + let mut cfg = Self { + width: DEFAULT_WIDTH, + height: DEFAULT_HEIGHT, + frame_count: DEFAULT_FRAME_COUNT, + iterations: 3, + filter: None, + }; + let mut i = 1; + while i < args.len() { + match args[i].as_str() { + "--width" | "-w" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.width = v.parse().unwrap_or(cfg.width); + } + }, + "--height" | "-h" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.height = v.parse().unwrap_or(cfg.height); + } + }, + "--frames" | "-n" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.frame_count = v.parse().unwrap_or(cfg.frame_count); + } + }, + "--iterations" | "-i" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.iterations = v.parse().unwrap_or(cfg.iterations); + } + }, + "--filter" | "-f" => { + i += 1; + if let Some(v) = args.get(i) { + cfg.filter = Some(v.clone()); + } + }, + _ => {}, + } + i += 1; + } + cfg + } +} + +// ── Frame generators ──────────────────────────────────────────────────────── + +/// Generate an RGBA8 color-bar frame (opaque, all alpha = 255). +fn generate_rgba_frame(width: u32, height: u32) -> Vec<u8> { + let w = width as usize; + let h = height as usize; + let mut data = vec![0u8; w * h * 4]; + // Simple vertical gradient bars for visual distinctness. + let bar_colors: &[(u8, u8, u8)] = &[ + (191, 191, 191), // white + (191, 191, 0), // yellow + (0, 191, 191), // cyan + (0, 191, 0), // green + (191, 0, 191), // magenta + (191, 0, 0), // red + (0, 0, 191), // blue + ]; + for row in 0..h { + for col in 0..w { + let bar_idx = col * bar_colors.len() / w; + let (r, g, b) = bar_colors[bar_idx]; + let off = (row * w + col) * 4; + data[off] = r; + data[off + 1] = g; + data[off + 2] = b; + data[off + 3] = 255; + } + } + data +} + +/// Generate an I420 frame by converting an RGBA frame. 
+fn generate_i420_frame(width: u32, height: u32) -> Vec<u8> { + let rgba = generate_rgba_frame(width, height); + rgba8_to_i420(&rgba, width, height) +} + +/// Generate an NV12 frame by converting an RGBA frame. +fn generate_nv12_frame(width: u32, height: u32) -> Vec<u8> { + let rgba = generate_rgba_frame(width, height); + let w = width as usize; + let h = height as usize; + let chroma_w = w.div_ceil(2); + let chroma_h = h.div_ceil(2); + let nv12_size = w * h + chroma_w * 2 * chroma_h; + let mut nv12 = vec![0u8; nv12_size]; + streamkit_nodes::video::compositor::pixel_ops::rgba8_to_nv12_buf( + &rgba, width, height, &mut nv12, + ); + nv12 +} + +// ── Compositing harness ───────────────────────────────────────────────────── + +/// Call the real `composite_frame` kernel for `frame_count` iterations, +/// returning per-frame timing statistics. This exercises all kernel +/// optimizations: conversion cache, skip-canvas-clear, identity-scale +/// fast-path, precomputed x-map, SSE2 blend, etc. +/// +/// Uses a real `VideoFramePool` to match production behaviour (pooled buffer +/// reuse instead of per-frame heap allocation). 
+fn bench_composite( + _label: &str, + canvas_w: u32, + canvas_h: u32, + layers: &[Option<LayerSnapshot>], + frame_count: u32, +) -> BenchResult { + let empty_overlays: Vec<Arc<DecodedOverlay>> = Vec::new(); + let mut conversion_cache = ConversionCache::new(); + let pool = VideoFramePool::video_default(); + + let start = Instant::now(); + + for _ in 0..frame_count { + let _result = composite_frame( + canvas_w, + canvas_h, + layers, + &empty_overlays, + &empty_overlays, + Some(&pool), + &mut conversion_cache, + ); + } + + let elapsed = start.elapsed(); + BenchResult { total_secs: elapsed.as_secs_f64(), frame_count } +} + +struct BenchResult { + total_secs: f64, + frame_count: u32, +} + +impl BenchResult { + fn fps(&self) -> f64 { + f64::from(self.frame_count) / self.total_secs + } + + fn ms_per_frame(&self) -> f64 { + self.total_secs * 1000.0 / f64::from(self.frame_count) + } +} + +// ── Scenario definitions ──────────────────────────────────────────────────── + +struct Scenario { + label: String, + layers: Vec<Option<LayerSnapshot>>, +} + +fn make_layer( + data: Vec<u8>, + width: u32, + height: u32, + pixel_format: PixelFormat, + rect: Option<Rect>, + opacity: f32, + z_index: i32, + rotation_degrees: f32, +) -> Option<LayerSnapshot> { + Some(LayerSnapshot { + data: Arc::new(PooledVideoData::from_vec(data)), + width, + height, + pixel_format, + rect, + opacity, + z_index, + rotation_degrees, + }) +} + +fn build_scenarios(canvas_w: u32, canvas_h: u32) -> Vec<Scenario> { + let pip_w = canvas_w / 3; + let pip_h = canvas_h / 3; + let pip_x = (canvas_w - pip_w - 20) as i32; + let pip_y = (canvas_h - pip_h - 20) as i32; + + vec![ + // 1 layer RGBA — baseline + Scenario { + label: "1-layer-rgba".to_string(), + layers: vec![make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + )], + }, + // 2 layers RGBA (PiP) + Scenario { + label: "2-layer-rgba-pip".to_string(), + layers: vec![ + make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 
1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 4 layers RGBA + Scenario { + label: "4-layer-rgba".to_string(), + layers: vec![ + make_layer( + generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: 20, y: 20, width: pip_w, height: pip_h }), + 0.8, + 2, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: 20, y: pip_y, width: pip_w, height: pip_h }), + 0.7, + 3, + 0.0, + ), + ], + }, + // 2 layers: I420 bg + RGBA PiP (measures conversion overhead) + Scenario { + label: "2-layer-i420+rgba".to_string(), + layers: vec![ + make_layer( + generate_i420_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::I420, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 2 layers: NV12 bg + RGBA PiP + Scenario { + label: "2-layer-nv12+rgba".to_string(), + layers: vec![ + make_layer( + generate_nv12_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Nv12, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 0.0, + ), + ], + }, + // 2 layers RGBA with rotation on PiP + Scenario { + label: "2-layer-rgba-rotated".to_string(), + layers: vec![ + make_layer( + 
generate_rgba_frame(canvas_w, canvas_h), + canvas_w, + canvas_h, + PixelFormat::Rgba8, + None, + 1.0, + 0, + 0.0, + ), + make_layer( + generate_rgba_frame(pip_w, pip_h), + pip_w, + pip_h, + PixelFormat::Rgba8, + Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + 1, + 15.0, // 15° rotation + ), + ], + }, + // 2 layers RGBA, static (same Arc — for future cache-hit measurement) + Scenario { + label: "2-layer-rgba-static".to_string(), + layers: { + let bg = + Arc::new(PooledVideoData::from_vec(generate_rgba_frame(canvas_w, canvas_h))); + let pip = Arc::new(PooledVideoData::from_vec(generate_rgba_frame(pip_w, pip_h))); + vec![ + Some(LayerSnapshot { + data: bg, + width: canvas_w, + height: canvas_h, + pixel_format: PixelFormat::Rgba8, + rect: None, + opacity: 1.0, + z_index: 0, + rotation_degrees: 0.0, + }), + Some(LayerSnapshot { + data: pip, + width: pip_w, + height: pip_h, + pixel_format: PixelFormat::Rgba8, + rect: Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + opacity: 0.9, + z_index: 1, + rotation_degrees: 0.0, + }), + ] + }, + }, + ] +} + +// ── Main ──────────────────────────────────────────────────────────────────── + +fn main() { + let args = BenchArgs::parse(); + + let resolutions: &[(u32, u32)] = if args.width == DEFAULT_WIDTH && args.height == DEFAULT_HEIGHT + { + // Default: run at multiple resolutions. + &[(640, 480), (1280, 720), (1920, 1080)] + } else { + // Custom: run at the specified resolution only. + // (Leak to get 'static — acceptable in a short-lived bench binary.) 
+ let res = Box::leak(Box::new([(args.width, args.height)])); + res + }; + + eprintln!("╔══════════════════════════════════════════════════════════╗"); + eprintln!("║ Compositor-Only Microbenchmark ║"); + eprintln!("╠══════════════════════════════════════════════════════════╣"); + eprintln!( + "║ Resolutions : {:<41}║", + resolutions.iter().map(|(w, h)| format!("{w}×{h}")).collect::<Vec<_>>().join(", ") + ); + eprintln!("║ Frames : {:<41}║", args.frame_count); + eprintln!("║ Iterations : {:<41}║", args.iterations); + if let Some(ref f) = args.filter { + eprintln!("║ Filter : {f:<41}║"); + } + eprintln!("╚══════════════════════════════════════════════════════════╝"); + eprintln!(); + + let mut json_results: Vec<serde_json::Value> = Vec::new(); + + for &(w, h) in resolutions { + eprintln!("── {w}×{h} ──────────────────────────────────────────────"); + + let scenarios = build_scenarios(w, h); + + for scenario in &scenarios { + if let Some(ref filter) = args.filter { + if !scenario.label.contains(filter.as_str()) { + continue; + } + } + + let mut iter_results = Vec::with_capacity(args.iterations as usize); + + for iter in 1..=args.iterations { + let result = + bench_composite(&scenario.label, w, h, &scenario.layers, args.frame_count); + eprintln!( + " {:<28} iter {iter}/{}: {:>8.1} fps ({:.2} ms/frame)", + scenario.label, + args.iterations, + result.fps(), + result.ms_per_frame(), + ); + iter_results.push(result); + } + + // Summary for this scenario. 
+ let fps_values: Vec<f64> = iter_results.iter().map(BenchResult::fps).collect(); + let ms_values: Vec<f64> = iter_results.iter().map(BenchResult::ms_per_frame).collect(); + let mean_fps = fps_values.iter().sum::<f64>() / fps_values.len() as f64; + let mean_ms = ms_values.iter().sum::<f64>() / ms_values.len() as f64; + let min_ms = ms_values.iter().copied().fold(f64::INFINITY, f64::min); + let max_ms = ms_values.iter().copied().fold(f64::NEG_INFINITY, f64::max); + + eprintln!( + " {:<28} avg: {:>8.1} fps ({:.2} ms/frame, min={:.2}, max={:.2})", + "", mean_fps, mean_ms, min_ms, max_ms, + ); + + json_results.push(serde_json::json!({ + "benchmark": "compositor_only", + "scenario": scenario.label, + "width": w, + "height": h, + "frame_count": args.frame_count, + "iterations": args.iterations, + "mean_fps": mean_fps, + "mean_ms_per_frame": mean_ms, + "min_ms_per_frame": min_ms, + "max_ms_per_frame": max_ms, + })); + } + eprintln!(); + } + + // Machine-readable JSON output. + println!("{}", serde_json::to_string_pretty(&json_results).expect("JSON serialization")); +} diff --git a/crates/nodes/src/video/compositor/kernel.rs b/crates/nodes/src/video/compositor/kernel.rs index c173360e..621fe893 100644 --- a/crates/nodes/src/video/compositor/kernel.rs +++ b/crates/nodes/src/video/compositor/kernel.rs @@ -19,6 +19,133 @@ use super::pixel_ops::{ // ── Compositing kernel (runs on a persistent blocking thread) ──────────────── +// ── YUV → RGBA conversion cache ───────────────────────────────────────────── + +/// Cached RGBA conversion result for a single layer slot. +struct CachedConversion { + /// Identity of the source data (`Arc::as_ptr` cast to `usize`). + /// When the `Arc` pointer hasn't changed between frames + /// the underlying data is identical and the conversion can be skipped. + data_identity: usize, + width: u32, + height: u32, + /// Pre-converted RGBA8 data, stored as a plain `Vec<u8>`. + rgba: Vec<u8>, +} + +/// Per-slot cache for YUV → RGBA conversions. 
+/// +/// Avoids redundant per-frame I420/NV12 → RGBA8 conversion when the source +/// `Arc` hasn't changed since the previous frame. +/// +/// Also caches the first-layer alpha-scan result so that the canvas-clear +/// skip check doesn't re-scan every frame when the source hasn't changed. +pub struct ConversionCache { + entries: Vec<Option<CachedConversion>>, + /// Cached result of the alpha-opaqueness scan for the first visible layer. + /// `(data_identity, all_opaque)` — valid when the `Arc` pointer matches. + first_layer_alpha_cache: Option<(usize, bool)>, +} + +impl ConversionCache { + pub const fn new() -> Self { + Self { entries: Vec::new(), first_layer_alpha_cache: None } + } + + /// Check whether the first visible layer's source data is fully opaque. + /// + /// For I420/NV12 layers, the converted RGBA always has alpha == 255, so + /// we return `true` immediately without scanning. For RGBA layers we + /// scan once and cache the result keyed by `Arc::as_ptr`. + fn first_layer_all_opaque(&mut self, layer: &LayerSnapshot, rgba_data: &[u8]) -> bool { + // I420/NV12 → RGBA conversion always writes alpha = 255. + if layer.pixel_format != PixelFormat::Rgba8 { + return true; + } + + let identity = Arc::as_ptr(&layer.data) as usize; + if let Some((cached_id, cached_result)) = self.first_layer_alpha_cache { + if cached_id == identity { + return cached_result; + } + } + + let all_opaque = rgba_data.chunks_exact(4).all(|px| px[3] == 255); + self.first_layer_alpha_cache = Some((identity, all_opaque)); + all_opaque + } + + /// Return a previously-cached RGBA slice for `slot_idx`. + /// + /// # Panics + /// + /// Panics if the slot has not been populated by a prior `get_or_convert` + /// call for the same `layer`. This is only called in the second pass of + /// `composite_frame` after the first pass has ensured every non-RGBA + /// layer has been converted. 
+ fn get_cached(&self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { + #[allow(clippy::expect_used)] + let cached = + self.entries[slot_idx].as_ref().expect("get_cached called before get_or_convert"); + let needed = layer.width as usize * layer.height as usize * 4; + &cached.rgba[..needed] + } + + /// Look up or perform a YUV→RGBA conversion for layer at `slot_idx`. + /// Returns a slice of RGBA8 data. + fn get_or_convert(&mut self, slot_idx: usize, layer: &LayerSnapshot) -> &[u8] { + let identity = Arc::as_ptr(&layer.data) as usize; + + // Ensure the cache Vec is large enough. + if self.entries.len() <= slot_idx { + self.entries.resize_with(slot_idx + 1, || None); + } + + // Check if the cached entry is still valid. + let needs_convert = self.entries[slot_idx].as_ref().map_or(true, |cached| { + cached.data_identity != identity + || cached.width != layer.width + || cached.height != layer.height + }); + + if needs_convert { + let needed = layer.width as usize * layer.height as usize * 4; + // Reuse the existing allocation if possible. + let mut rgba = self.entries[slot_idx].take().map(|c| c.rgba).unwrap_or_default(); + if rgba.len() < needed { + rgba.resize(needed, 0); + } + + match layer.pixel_format { + PixelFormat::I420 => { + i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba); + }, + PixelFormat::Nv12 => { + nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, &mut rgba); + }, + PixelFormat::Rgba8 => { + // Should not be called for RGBA, but handle gracefully. + rgba[..needed].copy_from_slice(&layer.data.as_slice()[..needed]); + }, + } + + self.entries[slot_idx] = Some(CachedConversion { + data_identity: identity, + width: layer.width, + height: layer.height, + rgba, + }); + } + + // SAFETY: we just inserted into this slot above when `needs_convert` was true, + // and the slot was already `Some` when `needs_convert` was false. 
+ #[allow(clippy::expect_used)] + let cached = self.entries[slot_idx].as_ref().expect("just inserted"); + let needed = layer.width as usize * layer.height as usize * 4; + &cached.rgba[..needed] + } +} + /// Snapshot of one input layer's data for the blocking compositor thread. pub struct LayerSnapshot { pub data: Arc<PooledVideoData>, @@ -27,8 +154,10 @@ pub struct LayerSnapshot { pub pixel_format: PixelFormat, pub rect: Option<Rect>, pub opacity: f32, - /// Visual stacking order. Lower values are drawn first (bottom). - /// Used to sort layers before compositing; ties broken by slot index. + /// Visual stacking order. Retained in the snapshot for diagnostic / + /// logging purposes even though sorting now happens before snapshot + /// construction. + #[allow(dead_code)] pub z_index: i32, /// Clockwise rotation in degrees around the destination rect centre. /// Default `0.0` means no rotation. @@ -56,8 +185,8 @@ pub struct CompositeResult { /// Composite all layers + overlays onto a fresh RGBA8 canvas buffer. /// Allocates from the video pool if available. /// -/// `i420_scratch` is a reusable buffer for I420/NV12→RGBA8 conversion, -/// avoiding per-frame allocation. +/// `conversion_cache` caches YUV→RGBA8 conversions across frames so that +/// unchanged layers skip the conversion entirely. pub fn composite_frame( canvas_w: u32, canvas_h: u32, @@ -65,7 +194,7 @@ pub fn composite_frame( image_overlays: &[Arc<DecodedOverlay>], text_overlays: &[Arc<DecodedOverlay>], video_pool: Option<&streamkit_core::VideoFramePool>, - i420_scratch: &mut Vec<u8>, + conversion_cache: &mut ConversionCache, ) -> streamkit_core::frame_pool::PooledVideoData { let total_bytes = (canvas_w as usize) * (canvas_h as usize) * 4; @@ -74,36 +203,83 @@ - // Zero the buffer (transparent black). let buf = pooled.as_mut_slice(); - buf[..total_bytes].fill(0); + + // Two-pass source resolution. + // + // Pass 1: populate the conversion cache for every non-RGBA layer. 
+ // `slot_idx` uses the position in the `layers` slice (which preserves + // `None` holes) so that cache indices stay stable even when some slots + // have no frame. + for (slot_idx, entry) in layers.iter().enumerate() { + if let Some(layer) = entry { + if layer.pixel_format != PixelFormat::Rgba8 { + conversion_cache.get_or_convert(slot_idx, layer); + } + } + } + + // Between pass 1 and pass 2: check whether the first layer allows + // skipping the canvas clear. We do the alpha-opaqueness check here + // while `conversion_cache` is still mutably available. The result + // is a simple bool so no borrows leak into pass 2. + let skip_clear = layers + .iter() + .enumerate() + .find_map(|(i, e)| e.as_ref().map(|l| (i, l))) + .map_or(false, |(_slot_idx, layer)| { + // Quick checks that don't need the pixel data. + if layer.opacity < 1.0 || layer.rotation_degrees.abs() >= 0.01 { + return false; + } + let covers = layer.rect.as_ref().map_or(true, |r| { + r.x <= 0 + && r.y <= 0 + && i64::from(r.width) + i64::from(r.x) >= i64::from(canvas_w) + && i64::from(r.height) + i64::from(r.y) >= i64::from(canvas_h) + }); + if !covers { + return false; + } + // Alpha check — needs mutable access to conversion_cache. + match layer.pixel_format { + // I420/NV12 → RGBA conversion always writes alpha = 255. + PixelFormat::I420 | PixelFormat::Nv12 => true, + PixelFormat::Rgba8 => { + conversion_cache.first_layer_all_opaque(layer, layer.data.as_slice()) + }, + } + }); + if !skip_clear { + buf[..total_bytes].fill(0); + } + + // Pass 2: build resolved references. The mutable borrow of + // `conversion_cache` from pass 1 is released, so we can now take + // shared references into the cache alongside references into `layers`. 
+ let resolved: Vec<Option<(&LayerSnapshot, &[u8])>> = layers + .iter() + .enumerate() + .map(|(slot_idx, entry)| { + entry.as_ref().map(|layer| { + let src_data: &[u8] = match layer.pixel_format { + PixelFormat::Rgba8 => layer.data.as_slice(), + PixelFormat::I420 | PixelFormat::Nv12 => { + // Cache was populated in pass 1; this is a shared + // read that cannot fail. + conversion_cache.get_cached(slot_idx, layer) + }, + }; + (layer, src_data) + }) + }) + .collect(); // Blit each layer (in order — first layer is bottom, last is top). - // I420 layers are converted to RGBA8 on-the-fly using the scratch buffer. - for layer in layers.iter().flatten() { + for (layer, src_data) in resolved.iter().flatten() { let dst_rect = layer.rect.clone().unwrap_or(Rect { x: 0, y: 0, width: canvas_w, height: canvas_h }); - let src_data: &[u8] = match layer.pixel_format { - PixelFormat::Rgba8 => layer.data.as_slice(), - PixelFormat::I420 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - i420_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch); - &i420_scratch[..needed] - }, - PixelFormat::Nv12 => { - let needed = layer.width as usize * layer.height as usize * 4; - if i420_scratch.len() < needed { - i420_scratch.resize(needed, 0); - } - nv12_to_rgba8_buf(layer.data.as_slice(), layer.width, layer.height, i420_scratch); - &i420_scratch[..needed] - }, - }; - scale_blit_rgba_rotated( buf, canvas_w, diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs index 34dde60a..9c45af98 100644 --- a/crates/nodes/src/video/compositor/mod.rs +++ b/crates/nodes/src/video/compositor/mod.rs @@ -25,12 +25,12 @@ //! - Bilinear / Lanczos scaling (MVP uses nearest-neighbor). 
pub mod config -mod kernel; -mod overlay; +pub mod kernel; +pub mod overlay; pub mod pixel_ops; use async_trait::async_trait; -use config::{CompositorConfig, Rect}; +use config::CompositorConfig; use kernel::{CompositeResult, CompositeWorkItem, LayerSnapshot}; use overlay::{decode_image_overlay, rasterize_text_overlay, DecodedOverlay}; use schemars::schema_for; @@ -48,7 +48,7 @@ use streamkit_core::{ }; use tokio::sync::mpsc; -use kernel::composite_frame; +use kernel::{composite_frame, ConversionCache}; // ── Input slot ────────────────────────────────────────────────────────────── @@ -59,6 +59,63 @@ struct InputSlot { latest_frame: Option, } +// ── Cached layer config ───────────────────────────────────────────────────── + +/// Pre-resolved layer configuration for a single slot. +/// Rebuilt only when compositor config or pin set changes, avoiding +/// per-frame `HashMap` lookups and `sort_by` calls. +#[derive(Clone)] +struct ResolvedSlotConfig { + rect: Option<config::Rect>, + opacity: f32, + z_index: i32, + rotation_degrees: f32, +} + +/// Rebuild the per-slot resolved configs and the z-sorted draw order. +/// +/// Called once at startup and whenever `UpdateParams` or pin management +/// changes the layer set. The returned draw order is a list of slot +/// indices sorted by `(z_index, slot_index)`. +fn rebuild_layer_cache( + slots: &[InputSlot], + config: &CompositorConfig, +) -> (Vec<ResolvedSlotConfig>, Vec<usize>) { + let num_slots = slots.len(); + let mut configs: Vec<ResolvedSlotConfig> = Vec::with_capacity(num_slots); + for (idx, slot) in slots.iter().enumerate() { + let layer_cfg = config.layers.get(&slot.name); + #[allow(clippy::option_if_let_else)] + let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg { + (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees) + } else if idx > 0 && num_slots > 1 { + // Auto-PiP: non-first layers without explicit config. 
+ let pip_w = config.width / 3; + let pip_h = config.height / 3; + #[allow(clippy::cast_possible_wrap)] + let pip_x = (config.width - pip_w - 20) as i32; + #[allow(clippy::cast_possible_wrap)] + let pip_y = (config.height - pip_h - 20) as i32; + #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)] + ( + Some(config::Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), + 0.9, + idx as i32, + 0.0, + ) + } else { + (None, 1.0, 0, 0.0) + }; + configs.push(ResolvedSlotConfig { rect, opacity, z_index, rotation_degrees }); + } + + // Pre-sort by (z_index, slot_index). + let mut draw_order: Vec<usize> = (0..num_slots).collect(); + draw_order.sort_by(|&a, &b| configs[a].z_index.cmp(&configs[b].z_index).then(a.cmp(&b))); + + (configs, draw_order) +} + // ── Node ──────────────────────────────────────────────────────────────────── /// Composites multiple raw video inputs onto a single RGBA8 canvas with @@ -285,9 +342,9 @@ impl ProcessorNode for CompositorNode { let (result_tx, mut result_rx) = tokio::sync::mpsc::channel::<CompositeResult>(2); let composite_thread = tokio::task::spawn_blocking(move || { - // Persistent scratch buffer for I420→RGBA8 layer conversion, - // reused across frames to avoid per-frame allocation. - let mut i420_to_rgba_scratch: Vec<u8> = Vec::new(); + // Per-slot cache for YUV→RGBA conversions. Avoids redundant + // conversion when the source Arc hasn't changed between frames. 
+ let mut conversion_cache = ConversionCache::new(); while let Some(work) = work_rx.blocking_recv() { let rgba_buf = composite_frame( @@ -297,7 +354,7 @@ &work.image_overlays, &work.text_overlays, work.video_pool.as_deref(), - &mut i420_to_rgba_scratch, + &mut conversion_cache, ); let result = CompositeResult { rgba_data: rgba_buf }; if result_tx.blocking_send(result).is_err() { @@ -309,6 +366,14 @@ let mut output_seq: u64 = 0; let mut stop_reason: &str = "shutdown"; + // ── Cached layer config + draw order ──────────────────────────── + // Rebuilt only when config or pin set changes (UpdateParams, + // pin add/remove, channel close). Avoids per-frame HashMap + // lookups and sort_by calls. + let mut layer_configs_dirty = true; + let mut resolved_configs: Vec<ResolvedSlotConfig> = Vec::new(); + let mut sorted_draw_order: Vec<usize> = Vec::new(); + loop { // ── Take at most one frame from every slot (non-blocking) ─── // We intentionally take only one frame per slot per iteration so @@ -347,6 +412,7 @@ params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -364,6 +430,7 @@ msg, &mut slots, ); + layer_configs_dirty = true; } // Wait for a frame from any connected input. 
@@ -379,6 +446,7 @@ slots[slot_idx].name ); slots.remove(slot_idx); + layer_configs_dirty = true; if slots.is_empty() { stop_reason = "all_inputs_closed"; should_break = true; @@ -425,6 +493,7 @@ params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -440,6 +509,7 @@ msg, &mut slots, ); + layer_configs_dirty = true; } } continue; @@ -463,6 +533,7 @@ params, &mut stats_tracker, ); + layer_configs_dirty = true; }, NodeControlMessage::Start => {}, } @@ -473,69 +544,40 @@ if let Some(ref mut pmrx) = pin_mgmt_rx { while let Ok(msg) = pmrx.try_recv() { Self::handle_pin_management(&mut self, msg, &mut slots); + layer_configs_dirty = true; } } + // ── Rebuild layer config cache if needed ───────────────────── + if layer_configs_dirty { + let (cfgs, order) = rebuild_layer_cache(&slots, &self.config); + resolved_configs = cfgs; + sorted_draw_order = order; + layer_configs_dirty = false; + } + // ── Send work to persistent compositing thread ───────────── - // Collect the data we need to send to the blocking thread. - let num_slots = slots.len(); - let mut layers: Vec<Option<LayerSnapshot>> = slots + // Build layer snapshots in pre-sorted draw order using the + // cached per-slot configs (no HashMap lookup, no sort). + let layers: Vec<Option<LayerSnapshot>> = sorted_draw_order .iter() - .enumerate() - .map(|(idx, slot)| { - slot.latest_frame.as_ref().map(|f| { - let layer_cfg = self.config.layers.get(&slot.name); - #[allow(clippy::option_if_let_else)] - let (rect, opacity, z_index, rotation_degrees) = if let Some(lc) = layer_cfg - { - // Explicit per-layer config. 
- (lc.rect.clone(), lc.opacity, lc.z_index, lc.rotation_degrees) - } else if idx > 0 && num_slots > 1 { - // Auto-PiP: non-first layers without explicit config - // are placed in the bottom-right corner at 1/3 canvas - // size with slight transparency. - let pip_w = self.config.width / 3; - let pip_h = self.config.height / 3; - #[allow(clippy::cast_possible_wrap)] - let pip_x = (self.config.width - pip_w - 20) as i32; - #[allow(clippy::cast_possible_wrap)] - let pip_y = (self.config.height - pip_h - 20) as i32; - #[allow(clippy::cast_possible_wrap)] - ( - Some(Rect { x: pip_x, y: pip_y, width: pip_w, height: pip_h }), - 0.9, - idx as i32, - 0.0, - ) - } else { - // First layer (or single input): fill the canvas. - (None, 1.0, 0, 0.0) - }; + .map(|&idx| { + slots[idx].latest_frame.as_ref().map(|f| { + let cfg = &resolved_configs[idx]; LayerSnapshot { data: f.data.clone(), width: f.width, height: f.height, pixel_format: f.pixel_format, - rect, - opacity, - z_index, - rotation_degrees, + rect: cfg.rect.clone(), + opacity: cfg.opacity, + z_index: cfg.z_index, + rotation_degrees: cfg.rotation_degrees, } }) }) .collect(); - // Sort layers by z_index so that lower values are drawn first - // (bottom of the stack). `None` entries (slots without a frame) - // are pushed to the end — they are skipped during compositing - // anyway. 
- layers.sort_by(|a, b| match (a, b) { - (Some(la), Some(lb)) => la.z_index.cmp(&lb.z_index), - (Some(_), None) => std::cmp::Ordering::Less, - (None, Some(_)) => std::cmp::Ordering::Greater, - (None, None) => std::cmp::Ordering::Equal, - }); - stats_tracker.received(); let work_item = CompositeWorkItem { @@ -773,7 +815,7 @@ mod tests { use crate::test_utils::{ assert_state_initializing, assert_state_running, assert_state_stopped, create_test_context, }; - use config::LayerConfig; + use config::{LayerConfig, Rect}; use pixel_ops::scale_blit_rgba; use std::collections::HashMap; use tokio::sync::mpsc; @@ -851,8 +893,8 @@ mod tests { #[test] fn test_composite_frame_empty_layers() { // No layers, no overlays -> transparent black canvas. - let mut scratch = Vec::new(); - let result = composite_frame(4, 4, &[], &[], &[], None, &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(4, 4, &[], &[], &[], None, &mut cache); let buf = result.as_slice(); assert_eq!(buf.len(), 4 * 4 * 4); assert!(buf.iter().all(|&b| b == 0)); @@ -872,8 +914,8 @@ mod tests { rotation_degrees: 0.0, }; - let mut scratch = Vec::new(); - let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(4, 4, &[Some(layer)], &[], &[], None, &mut cache); let buf = result.as_slice(); // Entire canvas should be red (scaled from 2x2 to 4x4). @@ -912,9 +954,9 @@ mod tests { rotation_degrees: 0.0, }; - let mut scratch = Vec::new(); + let mut cache = ConversionCache::new(); let result = - composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut scratch); + composite_frame(4, 4, &[Some(layer0), Some(layer1)], &[], &[], None, &mut cache); let buf = result.as_slice(); // (0,0) should be red. 
@@ -1079,8 +1121,8 @@ mod tests { let pool = FramePool::<u8>::preallocated(&[total], 2); assert_eq!(pool.stats().buckets[0].available, 2); - let mut scratch = Vec::new(); - let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut scratch); + let mut cache = ConversionCache::new(); + let result = composite_frame(canvas_w, canvas_h, &[], &[], &[], Some(&pool), &mut cache); assert_eq!(result.as_slice().len(), total); // One buffer was taken from the pool. assert_eq!(pool.stats().buckets[0].available, 1); diff --git a/crates/nodes/src/video/compositor/overlay.rs b/crates/nodes/src/video/compositor/overlay.rs index 3072870a..0fc1e6b2 100644 --- a/crates/nodes/src/video/compositor/overlay.rs +++ b/crates/nodes/src/video/compositor/overlay.rs @@ -36,13 +36,52 @@ pub fn decode_image_overlay(config: &ImageOverlayConfig) -> Result 0 && target_h > 0 && (w != target_w || h != target_h) { + let raw = rgba.into_raw(); + let scaled = prescale_rgba(&raw, w, h, target_w, target_h); + Ok(DecodedOverlay { + rgba_data: scaled, + width: target_w, + height: target_h, + rect: config.rect.clone(), + opacity: config.opacity, + }) + } else { + Ok(DecodedOverlay { + rgba_data: rgba.into_raw(), + width: w, + height: h, + rect: config.rect.clone(), + opacity: config.opacity, + }) + } +} + +/// Nearest-neighbor scale an RGBA8 buffer from `(sw, sh)` to `(dw, dh)`. +/// Used once at config time so the per-frame blit is a 1:1 copy.
+fn prescale_rgba(src: &[u8], sw: u32, sh: u32, dw: u32, dh: u32) -> Vec<u8> { + let sw = sw as usize; + let sh = sh as usize; + let dw = dw as usize; + let dh = dh as usize; + let mut out = vec![0u8; dw * dh * 4]; + for dy in 0..dh { + let sy = dy * sh / dh; + for dx in 0..dw { + let sx = dx * sw / dw; + let si = (sy * sw + sx) * 4; + let di = (dy * dw + dx) * 4; + out[di..di + 4].copy_from_slice(&src[si..si + 4]); + } + } + out } // ── Bundled default font ──────────────────────────────────────────────────── diff --git a/crates/nodes/src/video/compositor/pixel_ops.rs b/crates/nodes/src/video/compositor/pixel_ops.rs index 3464eb8e..3c6dd03e 100644 --- a/crates/nodes/src/video/compositor/pixel_ops.rs +++ b/crates/nodes/src/video/compositor/pixel_ops.rs @@ -110,28 +110,71 @@ pub fn scale_blit_rgba( let first_row_byte = ry * row_stride; let dst_rows = &mut dst[first_row_byte..]; + // ── Identity-scale fast path ─────────────────────────────────────── + // When source dimensions exactly match the destination rect and opacity + // is fully opaque, we can avoid per-pixel scaling entirely and use + // direct row copies (memcpy) for fully-opaque source rows. + if rw == sw && rh == sh && opacity >= 1.0 && src_col_skip == 0 && src_row_skip == 0 { + let src_row_bytes = sw * 4; + let copy_bytes = effective_rect_w * 4; + for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() { + let src_start = dy * src_row_bytes; + let src_end = src_start + copy_bytes; + if src_end > src.len() { + break; + } + let dst_start = rx * 4; + let dst_end = dst_start + copy_bytes; + if dst_end > row_slice.len() { + break; + } + // Check if the source row has any semi-transparent pixels. + // For fully-opaque rows, use bulk memcpy. For rows with alpha, + // fall back to per-pixel blending.
+ let src_row = &src[src_start..src_end]; + let all_opaque = src_row.chunks_exact(4).all(|px| px[3] == 255); + if all_opaque { + row_slice[dst_start..dst_end].copy_from_slice(src_row); + } else { + // Per-pixel alpha blend (identity scale, so sx == dx). + for dx in 0..effective_rect_w { + let si = dx * 4; + let sa = src_row[si + 3]; + if sa == 255 { + row_slice[dst_start + dx * 4..dst_start + dx * 4 + 4] + .copy_from_slice(&src_row[si..si + 4]); + } else if sa > 0 { + let di = dst_start + dx * 4; + let a16 = u16::from(sa); + row_slice[di] = blend_u8(src_row[si], row_slice[di], a16); + row_slice[di + 1] = blend_u8(src_row[si + 1], row_slice[di + 1], a16); + row_slice[di + 2] = blend_u8(src_row[si + 2], row_slice[di + 2], a16); + let da = u16::from(row_slice[di + 3]); + row_slice[di + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; + } + } + } + } + return; + } + + // ── Scaled blit path ─────────────────────────────────────────────── + // Precompute the source-X lookup table once. This replaces the per-pixel + // `(dx + src_col_skip) * sw / rw` integer division with a single table + // lookup in the inner blit loops. + let x_map: Vec<usize> = (0..effective_rect_w).map(|dx| (dx + src_col_skip) * sw / rw).collect(); + if effective_rh >= RAYON_ROW_THRESHOLD { dst_rows.par_chunks_mut(row_stride).take(effective_rh).enumerate().for_each( |(dy, row_slice)| { let sy = (dy + src_row_skip) * sh / rh; - blit_row( - row_slice, - rx, - effective_rect_w, - src, - sw, - sh, - sy, - rw, - opacity, - src_col_skip, - ); + blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map); }, ); } else { for (dy, row_slice) in dst_rows.chunks_mut(row_stride).take(effective_rh).enumerate() { let sy = (dy + src_row_skip) * sh / rh; - blit_row(row_slice, rx, effective_rect_w, src, sw, sh, sy, rw, opacity, src_col_skip); + blit_row(row_slice, rx, effective_rect_w, src, sw, sy, opacity, &x_map); } } } @@ -142,6 +185,9 @@ fn blit_row( /// rows in parallel.
The `row_slice` covers exactly one destination row /// starting at pixel column 0 (i.e. byte offset `rx * 4` is the first column /// we write to). +/// +/// `x_map` is a precomputed table mapping each destination column to the +/// corresponding source column, eliminating per-pixel integer division. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, @@ -155,18 +201,16 @@ fn blit_row( effective_rw: usize, src: &[u8], sw: usize, - sh: usize, sy: usize, - rw: usize, opacity: f32, - src_col_skip: usize, + x_map: &[usize], ) { // Fast path: when opacity is 1.0, we can skip the f32 multiply on alpha // and branch more cheaply. if opacity >= 1.0 { - blit_row_opaque(row_slice, rx, effective_rw, src, sw, sh, sy, rw, src_col_skip); + blit_row_opaque(row_slice, rx, effective_rw, src, sw, sy, x_map); } else { - blit_row_alpha(row_slice, rx, effective_rw, src, sw, sh, sy, rw, opacity, src_col_skip); + blit_row_alpha(row_slice, rx, effective_rw, src, sw, sy, opacity, x_map); } } @@ -180,16 +224,184 @@ const fn blend_u8(src: u8, dst: u8, alpha: u16) -> u8 { ((val + (val >> 8)) >> 8) as u8 } +// ── SSE2 alpha-blend helpers (x86-64) ────────────────────────────────────── +// +// Process 4 RGBA pixels at a time using SSE2 integer arithmetic. +// Source pixels are gathered (non-contiguous via x_map), destination pixels +// are contiguous. The blend formula is identical to the scalar `blend_u8`: +// result = ((src*alpha + dst*(255-alpha) + 128) + ((…) >> 8)) >> 8 +// +// For the alpha channel we set source-alpha to 255 before blending so that +// `blend_u8(255, dst_alpha, src_alpha)` naturally computes the standard +// over-composite alpha `a_src + a_dst*(1-a_src)` (within ±1 of the scalar +// approximation — both are approximate divisions by 255). + +/// Read 4 bytes from `src` at `offset` as a native-endian `u32`. +/// +/// # Safety +/// +/// Caller must ensure `offset + 3 < src.len()`. 
+#[inline(always)] +unsafe fn read_rgba_u32(src: &[u8], offset: usize) -> u32 { + std::ptr::read_unaligned(src.as_ptr().add(offset) as *const u32) +} + +/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels +/// using SSE2 "over" compositing (no opacity modifier). +/// +/// # Safety +/// +/// `dst_ptr` must point to at least 16 writable bytes. Source pixel values +/// in `src_pixels` must be valid RGBA `u32` values. +#[cfg(target_arch = "x86_64")] +#[inline(always)] +unsafe fn blend_4px_over_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4]) { + use std::arch::x86_64::*; + + let zero = _mm_setzero_si128(); + let c255 = _mm_set1_epi16(255); + let c128 = _mm_set1_epi16(128); + + // Assemble 4 gathered source pixels into one register. + let src4 = _mm_set_epi32( + src_pixels[3] as i32, + src_pixels[2] as i32, + src_pixels[1] as i32, + src_pixels[0] as i32, + ); + + // Mask with 0xFF at each pixel's alpha-byte position (bytes 3,7,11,15). + let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32); + + // Fast path: all 4 source pixels fully opaque → direct copy. + let alpha_bytes = _mm_and_si128(src4, alpha_byte_mask); + if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, alpha_byte_mask)) == 0xFFFF { + _mm_storeu_si128(dst_ptr as *mut __m128i, src4); + return; + } + + // Fast path: all 4 source pixels fully transparent → nothing to do. + if _mm_movemask_epi8(_mm_cmpeq_epi8(alpha_bytes, zero)) == 0xFFFF { + return; + } + + let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i); + + // Replace source alpha channel with 255 for correct composite-alpha + // via blend_u8(255, dst_alpha, src_alpha). + let src_blend = _mm_or_si128(src4, alpha_byte_mask); + + // --- Low 2 pixels (u16 arithmetic) --- + let src_lo = _mm_unpacklo_epi8(src_blend, zero); + let dst_lo = _mm_unpacklo_epi8(dst4, zero); + + // Extract original source alpha and broadcast within each 4-u16 pixel group. 
+ let src_orig_lo = _mm_unpacklo_epi8(src4, zero); + // _MM_SHUFFLE(3,3,3,3) = 0xFF → replicate element 3 (alpha) to all 4 positions. + let alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF); + + let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo); + let val_lo = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)), + c128, + ); + let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8); + + // --- High 2 pixels --- + let src_hi = _mm_unpackhi_epi8(src_blend, zero); + let dst_hi = _mm_unpackhi_epi8(dst4, zero); + let src_orig_hi = _mm_unpackhi_epi8(src4, zero); + let alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF); + + let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi); + let val_hi = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)), + c128, + ); + let result_hi = _mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8); + + // Pack back to u8 and store. + _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi)); +} + +/// Blend 4 gathered source RGBA pixels onto 4 contiguous destination pixels +/// using SSE2 "over" compositing **with** an opacity multiplier applied to +/// each pixel's source alpha. +/// +/// # Safety +/// +/// `dst_ptr` must point to at least 16 writable bytes. 
+#[cfg(target_arch = "x86_64")] +#[inline(always)] +unsafe fn blend_4px_over_alpha_sse2(dst_ptr: *mut u8, src_pixels: [u32; 4], opacity: u16) { + use std::arch::x86_64::*; + + let zero = _mm_setzero_si128(); + let c255 = _mm_set1_epi16(255); + let c128 = _mm_set1_epi16(128); + let opacity_v = _mm_set1_epi16(opacity as i16); + + let src4 = _mm_set_epi32( + src_pixels[3] as i32, + src_pixels[2] as i32, + src_pixels[1] as i32, + src_pixels[0] as i32, + ); + + let dst4 = _mm_loadu_si128(dst_ptr as *const __m128i); + let alpha_byte_mask = _mm_set1_epi32(0xFF00_0000_u32 as i32); + let src_blend = _mm_or_si128(src4, alpha_byte_mask); + + // --- Low 2 pixels --- + let src_lo = _mm_unpacklo_epi8(src_blend, zero); + let dst_lo = _mm_unpacklo_epi8(dst4, zero); + + // Extract original alpha, apply opacity: sa_eff = (sa * opacity + 128) >> 8. + // Max value: (255*255+128)>>8 = 254, so no clamping needed. + let src_orig_lo = _mm_unpacklo_epi8(src4, zero); + let raw_alpha_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_lo, 0xFF), 0xFF); + let alpha_lo = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_lo, opacity_v), c128), 8); + + let inv_alpha_lo = _mm_sub_epi16(c255, alpha_lo); + let val_lo = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_lo, alpha_lo), _mm_mullo_epi16(dst_lo, inv_alpha_lo)), + c128, + ); + let result_lo = _mm_srli_epi16(_mm_add_epi16(val_lo, _mm_srli_epi16(val_lo, 8)), 8); + + // --- High 2 pixels --- + let src_hi = _mm_unpackhi_epi8(src_blend, zero); + let dst_hi = _mm_unpackhi_epi8(dst4, zero); + let src_orig_hi = _mm_unpackhi_epi8(src4, zero); + let raw_alpha_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src_orig_hi, 0xFF), 0xFF); + let alpha_hi = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(raw_alpha_hi, opacity_v), c128), 8); + + let inv_alpha_hi = _mm_sub_epi16(c255, alpha_hi); + let val_hi = _mm_add_epi16( + _mm_add_epi16(_mm_mullo_epi16(src_hi, alpha_hi), _mm_mullo_epi16(dst_hi, inv_alpha_hi)), + c128, + ); + let result_hi = 
_mm_srli_epi16(_mm_add_epi16(val_hi, _mm_srli_epi16(val_hi, 8)), 8); + + _mm_storeu_si128(dst_ptr as *mut __m128i, _mm_packus_epi16(result_lo, result_hi)); +} + /// Inner blit for fully-opaque layers (`opacity >= 1.0`). Skips the /// per-pixel f32 multiply on the source alpha channel. /// /// Uses integer-only alpha blending for semi-transparent source pixels. +/// `x_map` provides precomputed source-X indices (one per destination column). +/// +/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is +/// wide enough and bounds can be pre-validated. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, - clippy::inline_always + clippy::inline_always, + // dx is used as both x_map index and dst offset, so an iterator is non-trivial. + clippy::needless_range_loop )] #[inline(always)] fn blit_row_opaque( @@ -198,14 +410,65 @@ fn blit_row_opaque( effective_rw: usize, src: &[u8], sw: usize, - _sh: usize, sy: usize, - rw: usize, - src_col_skip: usize, + x_map: &[usize], ) { let src_row_base = sy * sw * 4; + + // ── SSE2 fast path: process 4 pixels at a time ───────────────────── + #[cfg(target_arch = "x86_64")] + { + // Pre-validate bounds so the inner SIMD loop is branch-free. + let src_row_end = src_row_base + sw * 4; + let dst_end = (rx + effective_rw) * 4; + if src_row_end <= src.len() && dst_end <= row_slice.len() { + let chunks = effective_rw / 4; + for c in 0..chunks { + let dx = c * 4; + // SAFETY: bounds pre-validated above; x_map values < sw; + // dst range (rx+dx)*4..(rx+dx+4)*4 < dst_end <= row_slice.len(). 
+ unsafe { + let pixels = [ + read_rgba_u32(src, src_row_base + x_map[dx] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4), + ]; + blend_4px_over_sse2(row_slice.as_mut_ptr().add((rx + dx) * 4), pixels); + } + } + + // Scalar tail for remaining 0-3 pixels. + let tail_start = chunks * 4; + for dx in tail_start..effective_rw { + let sx = x_map[dx]; + let src_idx = src_row_base + sx * 4; + let sr = src[src_idx]; + let sg = src[src_idx + 1]; + let sb = src[src_idx + 2]; + let sa = src[src_idx + 3]; + let dst_idx = (rx + dx) * 4; + if sa == 255 { + row_slice[dst_idx] = sr; + row_slice[dst_idx + 1] = sg; + row_slice[dst_idx + 2] = sb; + row_slice[dst_idx + 3] = 255; + } else if sa > 0 { + let a16 = u16::from(sa); + row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16); + row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16); + row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16); + let da = u16::from(row_slice[dst_idx + 3]); + row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; + } + } + return; + } + } + + // ── Scalar fallback (bounds-checked per pixel) ───────────────────── for dx in 0..effective_rw { - let sx = (dx + src_col_skip) * sw / rw; + let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; if src_idx + 3 >= src.len() { continue; @@ -231,7 +494,6 @@ fn blit_row_opaque( row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], a16); row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], a16); row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], a16); - // Composite alpha: a_out = a_src + a_dst * (1 - a_src) let da = u16::from(row_slice[dst_idx + 3]); row_slice[dst_idx + 3] = (a16 + ((da * (255 - a16) + 128) >> 8)).min(255) as u8; } @@ -242,12 +504,18 @@ fn blit_row_opaque( /// Applies the opacity multiplier to every source pixel's alpha channel. 
/// /// Uses integer-only alpha blending. +/// `x_map` provides precomputed source-X indices (one per destination column). +/// +/// On x86-64, processes 4 pixels at a time using SSE2 SIMD when the row is +/// wide enough and bounds can be pre-validated. #[allow( clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::too_many_arguments, clippy::suboptimal_flops, - clippy::inline_always + clippy::inline_always, + // dx is used as both x_map index and dst offset, so an iterator is non-trivial. + clippy::needless_range_loop )] #[inline(always)] fn blit_row_alpha( @@ -256,18 +524,70 @@ fn blit_row_alpha( effective_rw: usize, src: &[u8], sw: usize, - _sh: usize, sy: usize, - rw: usize, opacity: f32, - src_col_skip: usize, + x_map: &[usize], ) { // Pre-compute opacity as a 0..255 integer multiplier. let opacity_u16 = (opacity * 255.0 + 0.5) as u16; let src_row_base = sy * sw * 4; + // ── SSE2 fast path ───────────────────────────────────────────────── + #[cfg(target_arch = "x86_64")] + { + let src_row_end = src_row_base + sw * 4; + let dst_end = (rx + effective_rw) * 4; + if src_row_end <= src.len() && dst_end <= row_slice.len() { + let chunks = effective_rw / 4; + for c in 0..chunks { + let dx = c * 4; + unsafe { + let pixels = [ + read_rgba_u32(src, src_row_base + x_map[dx] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 1] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 2] * 4), + read_rgba_u32(src, src_row_base + x_map[dx + 3] * 4), + ]; + blend_4px_over_alpha_sse2( + row_slice.as_mut_ptr().add((rx + dx) * 4), + pixels, + opacity_u16, + ); + } + } + + // Scalar tail. 
+ let tail_start = chunks * 4; + for dx in tail_start..effective_rw { + let sx = x_map[dx]; + let src_idx = src_row_base + sx * 4; + let sr = src[src_idx]; + let sg = src[src_idx + 1]; + let sb = src[src_idx + 2]; + let sa = src[src_idx + 3]; + let dst_idx = (rx + dx) * 4; + let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255); + if sa_eff == 255 { + row_slice[dst_idx] = sr; + row_slice[dst_idx + 1] = sg; + row_slice[dst_idx + 2] = sb; + row_slice[dst_idx + 3] = 255; + } else if sa_eff > 0 { + row_slice[dst_idx] = blend_u8(sr, row_slice[dst_idx], sa_eff); + row_slice[dst_idx + 1] = blend_u8(sg, row_slice[dst_idx + 1], sa_eff); + row_slice[dst_idx + 2] = blend_u8(sb, row_slice[dst_idx + 2], sa_eff); + let da = u16::from(row_slice[dst_idx + 3]); + row_slice[dst_idx + 3] = + (sa_eff + ((da * (255 - sa_eff) + 128) >> 8)).min(255) as u8; + } + } + return; + } + } + + // ── Scalar fallback ──────────────────────────────────────────────── for dx in 0..effective_rw { - let sx = (dx + src_col_skip) * sw / rw; + let sx = x_map[dx]; let src_idx = src_row_base + sx * 4; if src_idx + 3 >= src.len() { continue; @@ -283,7 +603,6 @@ fn blit_row_alpha( continue; } - // Effective alpha: (sa * opacity) / 255, done in integer. let sa_eff = ((u16::from(sa) * opacity_u16 + 128) >> 8).min(255); if sa_eff == 255 { row_slice[dst_idx] = sr;