feat(compositor): factor output_format into GPU heuristic

streamkit-devin · streamer45 · streamkit-devin · commit 69950144d880 · 2026-03-29T09:47:04.000Z
When the compositor's output_format is NV12 or I420, the GPU path eliminates the expensive CPU RGBA→YUV conversion entirely (~14% of CPU time in profiled pipelines). The should_use_gpu() heuristic now considers this, preferring GPU compositing whenever YUV output is requested — even for simple scenes that would otherwise stay on CPU.

This addresses the #1 CPU hotspot identified in production profiling: rgba8_to_nv12_buf at 9.12% + parallel_rows at 5.28% = 14.4% combined.

Signed-off-by: Devin AI <devin@streamkit.dev>
Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
diff --git a/crates/nodes/src/video/compositor/gpu.rs b/crates/nodes/src/video/compositor/gpu.rs
@@ -1551,14 +1551,19 @@ impl GpuMode {
 /// Decide whether to use GPU compositing for this frame based on scene
 /// complexity.  Used when `GpuMode::Auto` is selected.
 ///
-/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop).
-/// CPU wins for: single opaque layer at identity scale (memcpy fast path).
+/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop),
+/// or YUV output (the GPU `rgba_to_yuv.wgsl` shader eliminates the
+/// expensive CPU RGBA→NV12/I420 conversion — ~14% of CPU time in
+/// profiled pipelines).
+/// CPU wins for: single opaque layer at identity scale with RGBA output
+/// (memcpy fast path).
 pub fn should_use_gpu(
     canvas_w: u32,
     canvas_h: u32,
     layers: &[Option<LayerSnapshot>],
     image_overlays: &[Arc<DecodedOverlay>],
     text_overlays: &[Arc<DecodedOverlay>],
+    output_format: Option<PixelFormat>,
 ) -> bool {
     let visible_layers = layers.iter().filter(|l| l.is_some()).count();
     let total_items = visible_layers + image_overlays.len() + text_overlays.len();
@@ -1567,9 +1572,15 @@ pub fn should_use_gpu(
         l.rotation_degrees.abs() > 0.01 || l.crop_zoom > 1.01 || l.crop_shape != CropShape::Rect
     });
 
+    // When the output needs YUV (NV12/I420), the GPU path eliminates the
+    // CPU RGBA→YUV conversion entirely — the `rgba_to_yuv.wgsl` compute
+    // shader handles it on the GPU and the CPU only receives the
+    // already-converted buffer from the readback.
+    let needs_yuv_output = matches!(output_format, Some(PixelFormat::Nv12 | PixelFormat::I420));
+
     // GPU is worthwhile when there's enough work to amortise
     // the upload + readback overhead (~0.5ms for 1080p).
-    total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects
+    total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects || needs_yuv_output
 }
 
 // ── GPU/CPU path hysteresis ─────────────────────────────────────────────────
@@ -1615,8 +1626,10 @@ pub fn should_use_gpu_with_state(
     layers: &[Option<LayerSnapshot>],
     image_overlays: &[Arc<DecodedOverlay>],
     text_overlays: &[Arc<DecodedOverlay>],
+    output_format: Option<PixelFormat>,
 ) -> bool {
-    let vote_gpu = should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays);
+    let vote_gpu =
+        should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays, output_format);
 
     if vote_gpu == state.last_used_gpu {
         // Same path as last frame — reset the flip counter.
diff --git a/crates/nodes/src/video/compositor/gpu_tests.rs b/crates/nodes/src/video/compositor/gpu_tests.rs
@@ -727,24 +727,27 @@ fn gpu_circle_crop_with_zoom() {
 fn gpu_should_use_gpu_heuristic() {
     use super::gpu::should_use_gpu;
 
-    // Single small layer → CPU.
+    // Single small layer, RGBA output → CPU.
     let small_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     assert!(
-        !should_use_gpu(320, 240, &[Some(small_layer)], &[], &[]),
+        !should_use_gpu(320, 240, &[Some(small_layer)], &[], &[], None),
         "Single small layer should prefer CPU"
     );
 
     // Two layers → GPU.
     let l1 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     let l2 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     assert!(
-        should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[]),
+        should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[], None),
         "Two layers should prefer GPU"
     );
 
     // 1080p single layer → GPU (high pixel count).
     let big_layer = make_layer(solid_rgba(1920, 1080, 0, 0, 0, 255), 1920, 1080, None);
-    assert!(should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[]), "1080p should prefer GPU");
+    assert!(
+        should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[], None),
+        "1080p should prefer GPU"
+    );
 
     // Single layer with rotation → GPU (effects).
     let rotated = make_layer_with_props(
@@ -761,9 +764,30 @@ fn gpu_should_use_gpu_heuristic() {
         CropShape::Rect,
     );
     assert!(
-        should_use_gpu(320, 240, &[Some(rotated)], &[], &[]),
+        should_use_gpu(320, 240, &[Some(rotated)], &[], &[], None),
         "Rotated layer should prefer GPU"
     );
+
+    // Single small layer with NV12 output → GPU (YUV conversion offload).
+    let nv12_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
+    assert!(
+        should_use_gpu(320, 240, &[Some(nv12_layer)], &[], &[], Some(PixelFormat::Nv12)),
+        "NV12 output should prefer GPU even for single small layer"
+    );
+
+    // Single small layer with I420 output → GPU (YUV conversion offload).
+    let i420_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
+    assert!(
+        should_use_gpu(320, 240, &[Some(i420_layer)], &[], &[], Some(PixelFormat::I420)),
+        "I420 output should prefer GPU even for single small layer"
+    );
+
+    // Single small layer with RGBA8 output → CPU (no conversion needed).
+    let rgba_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
+    assert!(
+        !should_use_gpu(320, 240, &[Some(rgba_layer)], &[], &[], Some(PixelFormat::Rgba8)),
+        "RGBA8 output should prefer CPU for single small layer"
+    );
 }
 
 // ── Phase 2 tests ───────────────────────────────────────────────────────────
@@ -908,12 +932,13 @@ fn gpu_hysteresis_stability() {
 
     // First 4 frames voting GPU should NOT flip (hysteresis = 5).
     for _ in 0..4 {
-        let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
+        let result =
+            gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
         assert!(!result, "Should stay on CPU during hysteresis window");
     }
 
     // 5th consecutive frame should flip to GPU.
-    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
+    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
     assert!(result, "Should flip to GPU after 5 consecutive votes");
 
     // Now on GPU. Build a scene that votes CPU.
@@ -922,13 +947,13 @@ fn gpu_hysteresis_stability() {
 
     // Interleave: vote CPU, then vote GPU — should reset the counter.
     for _ in 0..3 {
-        gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[]);
+        gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[], None);
     }
     // Interrupt with a GPU vote (re-add two layers).
     let l3 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     let l4 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     let gpu_scene2: Vec<Option<LayerSnapshot>> = vec![Some(l3), Some(l4)];
-    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[]);
+    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[], None);
     assert!(result, "Interruption should reset counter; stay on GPU");
 }
 
@@ -943,7 +968,7 @@ fn gpu_hysteresis_seeded_skips_warmup() {
     let gpu_scene: Vec<Option<LayerSnapshot>> = vec![Some(l1), Some(l2)];
 
     // First frame should immediately use GPU — no warm-up.
-    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
+    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
     assert!(result, "Seeded state should use GPU on the very first frame");
 }
 
diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs
@@ -563,6 +563,8 @@ impl ProcessorNode for CompositorNode {
         let initial_canvas_w = self.config.width;
         #[cfg(feature = "gpu")]
         let initial_canvas_h = self.config.height;
+        #[cfg(feature = "gpu")]
+        let initial_output_format = self.output_format;
 
         let composite_thread = tokio::task::spawn_blocking(move || {
             // Per-slot cache for YUV→RGBA conversions. Avoids redundant
@@ -606,6 +608,7 @@ impl ProcessorNode for CompositorNode {
                     &[], // no layers yet — seed from canvas size alone
                     &[],
                     &[],
+                    initial_output_format,
                 );
             #[cfg(feature = "gpu")]
             let mut gpu_path_state = gpu::GpuPathState::new_seeded(initial_should_gpu);
@@ -657,6 +660,7 @@ impl ProcessorNode for CompositorNode {
                                     &work.layers,
                                     &work.image_overlays,
                                     &work.text_overlays,
+                                    work.output_format,
                                 )
                         },
                     };