Skip to content

Commit 6995014

Browse files
feat(compositor): factor output_format into GPU heuristic
When the compositor's output_format is NV12 or I420, the GPU path eliminates the expensive CPU RGBA→YUV conversion entirely (~14% of CPU time in profiled pipelines). The should_use_gpu() heuristic now considers this, preferring GPU compositing whenever YUV output is requested — even for simple scenes that would otherwise stay on CPU. This addresses the #1 CPU hotspot identified in production profiling: rgba8_to_nv12_buf at 9.12% + parallel_rows at 5.28% = 14.4% combined. Signed-off-by: Devin AI <devin@streamkit.dev> Signed-off-by: StreamKit Devin <devin@streamkit.dev> Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
1 parent fbf2084 commit 6995014

File tree

3 files changed

+56
-14
lines changed

3 files changed

+56
-14
lines changed

crates/nodes/src/video/compositor/gpu.rs

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,14 +1551,19 @@ impl GpuMode {
15511551
/// Decide whether to use GPU compositing for this frame based on scene
15521552
/// complexity. Used when `GpuMode::Auto` is selected.
15531553
///
1554-
/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop).
1555-
/// CPU wins for: single opaque layer at identity scale (memcpy fast path).
1554+
/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop),
1555+
/// or YUV output (the GPU `rgba_to_yuv.wgsl` shader eliminates the
1556+
/// expensive CPU RGBA→NV12/I420 conversion — ~14% of CPU time in
1557+
/// profiled pipelines).
1558+
/// CPU wins for: single opaque layer at identity scale with RGBA output
1559+
/// (memcpy fast path).
15561560
pub fn should_use_gpu(
15571561
canvas_w: u32,
15581562
canvas_h: u32,
15591563
layers: &[Option<LayerSnapshot>],
15601564
image_overlays: &[Arc<DecodedOverlay>],
15611565
text_overlays: &[Arc<DecodedOverlay>],
1566+
output_format: Option<PixelFormat>,
15621567
) -> bool {
15631568
let visible_layers = layers.iter().filter(|l| l.is_some()).count();
15641569
let total_items = visible_layers + image_overlays.len() + text_overlays.len();
@@ -1567,9 +1572,15 @@ pub fn should_use_gpu(
15671572
l.rotation_degrees.abs() > 0.01 || l.crop_zoom > 1.01 || l.crop_shape != CropShape::Rect
15681573
});
15691574

1575+
// When the output needs YUV (NV12/I420), the GPU path eliminates the
1576+
// CPU RGBA→YUV conversion entirely — the `rgba_to_yuv.wgsl` compute
1577+
// shader handles it on the GPU and the CPU only receives the
1578+
// already-converted buffer from the readback.
1579+
let needs_yuv_output = matches!(output_format, Some(PixelFormat::Nv12 | PixelFormat::I420));
1580+
15701581
// GPU is worthwhile when there's enough work to amortise
15711582
// the upload + readback overhead (~0.5ms for 1080p).
1572-
total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects
1583+
total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects || needs_yuv_output
15731584
}
15741585

15751586
// ── GPU/CPU path hysteresis ─────────────────────────────────────────────────
@@ -1615,8 +1626,10 @@ pub fn should_use_gpu_with_state(
16151626
layers: &[Option<LayerSnapshot>],
16161627
image_overlays: &[Arc<DecodedOverlay>],
16171628
text_overlays: &[Arc<DecodedOverlay>],
1629+
output_format: Option<PixelFormat>,
16181630
) -> bool {
1619-
let vote_gpu = should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays);
1631+
let vote_gpu =
1632+
should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays, output_format);
16201633

16211634
if vote_gpu == state.last_used_gpu {
16221635
// Same path as last frame — reset the flip counter.

crates/nodes/src/video/compositor/gpu_tests.rs

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -727,24 +727,27 @@ fn gpu_circle_crop_with_zoom() {
727727
fn gpu_should_use_gpu_heuristic() {
728728
use super::gpu::should_use_gpu;
729729

730-
// Single small layer → CPU.
730+
// Single small layer, RGBA output → CPU.
731731
let small_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
732732
assert!(
733-
!should_use_gpu(320, 240, &[Some(small_layer)], &[], &[]),
733+
!should_use_gpu(320, 240, &[Some(small_layer)], &[], &[], None),
734734
"Single small layer should prefer CPU"
735735
);
736736

737737
// Two layers → GPU.
738738
let l1 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
739739
let l2 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
740740
assert!(
741-
should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[]),
741+
should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[], None),
742742
"Two layers should prefer GPU"
743743
);
744744

745745
// 1080p single layer → GPU (high pixel count).
746746
let big_layer = make_layer(solid_rgba(1920, 1080, 0, 0, 0, 255), 1920, 1080, None);
747-
assert!(should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[]), "1080p should prefer GPU");
747+
assert!(
748+
should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[], None),
749+
"1080p should prefer GPU"
750+
);
748751

749752
// Single layer with rotation → GPU (effects).
750753
let rotated = make_layer_with_props(
@@ -761,9 +764,30 @@ fn gpu_should_use_gpu_heuristic() {
761764
CropShape::Rect,
762765
);
763766
assert!(
764-
should_use_gpu(320, 240, &[Some(rotated)], &[], &[]),
767+
should_use_gpu(320, 240, &[Some(rotated)], &[], &[], None),
765768
"Rotated layer should prefer GPU"
766769
);
770+
771+
// Single small layer with NV12 output → GPU (YUV conversion offload).
772+
let nv12_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
773+
assert!(
774+
should_use_gpu(320, 240, &[Some(nv12_layer)], &[], &[], Some(PixelFormat::Nv12)),
775+
"NV12 output should prefer GPU even for single small layer"
776+
);
777+
778+
// Single small layer with I420 output → GPU (YUV conversion offload).
779+
let i420_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
780+
assert!(
781+
should_use_gpu(320, 240, &[Some(i420_layer)], &[], &[], Some(PixelFormat::I420)),
782+
"I420 output should prefer GPU even for single small layer"
783+
);
784+
785+
// Single small layer with RGBA8 output → CPU (no conversion needed).
786+
let rgba_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
787+
assert!(
788+
!should_use_gpu(320, 240, &[Some(rgba_layer)], &[], &[], Some(PixelFormat::Rgba8)),
789+
"RGBA8 output should prefer CPU for single small layer"
790+
);
767791
}
768792

769793
// ── Phase 2 tests ───────────────────────────────────────────────────────────
@@ -908,12 +932,13 @@ fn gpu_hysteresis_stability() {
908932

909933
// First 4 frames voting GPU should NOT flip (hysteresis = 5).
910934
for _ in 0..4 {
911-
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
935+
let result =
936+
gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
912937
assert!(!result, "Should stay on CPU during hysteresis window");
913938
}
914939

915940
// 5th consecutive frame should flip to GPU.
916-
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
941+
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
917942
assert!(result, "Should flip to GPU after 5 consecutive votes");
918943

919944
// Now on GPU. Build a scene that votes CPU.
@@ -922,13 +947,13 @@ fn gpu_hysteresis_stability() {
922947

923948
// Interleave: vote CPU, then vote GPU — should reset the counter.
924949
for _ in 0..3 {
925-
gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[]);
950+
gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[], None);
926951
}
927952
// Interrupt with a GPU vote (re-add two layers).
928953
let l3 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
929954
let l4 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
930955
let gpu_scene2: Vec<Option<LayerSnapshot>> = vec![Some(l3), Some(l4)];
931-
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[]);
956+
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[], None);
932957
assert!(result, "Interruption should reset counter; stay on GPU");
933958
}
934959

@@ -943,7 +968,7 @@ fn gpu_hysteresis_seeded_skips_warmup() {
943968
let gpu_scene: Vec<Option<LayerSnapshot>> = vec![Some(l1), Some(l2)];
944969

945970
// First frame should immediately use GPU — no warm-up.
946-
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
971+
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
947972
assert!(result, "Seeded state should use GPU on the very first frame");
948973
}
949974

crates/nodes/src/video/compositor/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,8 @@ impl ProcessorNode for CompositorNode {
563563
let initial_canvas_w = self.config.width;
564564
#[cfg(feature = "gpu")]
565565
let initial_canvas_h = self.config.height;
566+
#[cfg(feature = "gpu")]
567+
let initial_output_format = self.output_format;
566568

567569
let composite_thread = tokio::task::spawn_blocking(move || {
568570
// Per-slot cache for YUV→RGBA conversions. Avoids redundant
@@ -606,6 +608,7 @@ impl ProcessorNode for CompositorNode {
606608
&[], // no layers yet — seed from canvas size alone
607609
&[],
608610
&[],
611+
initial_output_format,
609612
);
610613
#[cfg(feature = "gpu")]
611614
let mut gpu_path_state = gpu::GpuPathState::new_seeded(initial_should_gpu);
@@ -657,6 +660,7 @@ impl ProcessorNode for CompositorNode {
657660
&work.layers,
658661
&work.image_overlays,
659662
&work.text_overlays,
663+
work.output_format,
660664
)
661665
},
662666
};

0 commit comments

Comments
 (0)