diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 39db0efd..2d7dcdef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,12 @@ on: branches: [ main ] workflow_dispatch: {} +# Cancel superseded runs on the same PR / branch so the single +# self-hosted GPU runner isn't blocked by stale jobs. +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: skit: name: Skit diff --git a/crates/nodes/src/video/compositor/gpu.rs b/crates/nodes/src/video/compositor/gpu.rs index a655e76b..bbc04205 100644 --- a/crates/nodes/src/video/compositor/gpu.rs +++ b/crates/nodes/src/video/compositor/gpu.rs @@ -1551,14 +1551,19 @@ impl GpuMode { /// Decide whether to use GPU compositing for this frame based on scene /// complexity. Used when `GpuMode::Auto` is selected. /// -/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop). -/// CPU wins for: single opaque layer at identity scale (memcpy fast path). +/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop), +/// or YUV output (the GPU `rgba_to_yuv.wgsl` shader eliminates the +/// expensive CPU RGBA→NV12/I420 conversion — ~14% of CPU time in +/// profiled pipelines). +/// CPU wins for: single opaque layer at identity scale with RGBA output +/// (memcpy fast path). 
pub fn should_use_gpu( canvas_w: u32, canvas_h: u32, layers: &[Option], image_overlays: &[Arc], text_overlays: &[Arc], + output_format: Option<PixelFormat>, ) -> bool { let visible_layers = layers.iter().filter(|l| l.is_some()).count(); let total_items = visible_layers + image_overlays.len() + text_overlays.len(); @@ -1567,9 +1572,15 @@ pub fn should_use_gpu( l.rotation_degrees.abs() > 0.01 || l.crop_zoom > 1.01 || l.crop_shape != CropShape::Rect }); + // When the output needs YUV (NV12/I420), the GPU path eliminates the + // CPU RGBA→YUV conversion entirely — the `rgba_to_yuv.wgsl` compute + // shader handles it on the GPU and the CPU only receives the + // already-converted buffer from the readback. + let needs_yuv_output = matches!(output_format, Some(PixelFormat::Nv12 | PixelFormat::I420)); + // GPU is worthwhile when there's enough work to amortise // the upload + readback overhead (~0.5ms for 1080p). - total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects + total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects || needs_yuv_output } // ── GPU/CPU path hysteresis ───────────────────────────────────────────────── @@ -1615,8 +1626,10 @@ pub fn should_use_gpu_with_state( layers: &[Option], image_overlays: &[Arc], text_overlays: &[Arc], + output_format: Option<PixelFormat>, ) -> bool { - let vote_gpu = should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays); + let vote_gpu = + should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays, output_format); if vote_gpu == state.last_used_gpu { // Same path as last frame — reset the flip counter. diff --git a/crates/nodes/src/video/compositor/gpu_tests.rs b/crates/nodes/src/video/compositor/gpu_tests.rs index a4599dfc..d1761065 100644 --- a/crates/nodes/src/video/compositor/gpu_tests.rs +++ b/crates/nodes/src/video/compositor/gpu_tests.rs @@ -727,10 +727,10 @@ fn gpu_circle_crop_with_zoom() { fn gpu_should_use_gpu_heuristic() { use super::gpu::should_use_gpu; - // Single small layer → CPU. 
+ // Single small layer, RGBA output → CPU. let small_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); assert!( - !should_use_gpu(320, 240, &[Some(small_layer)], &[], &[]), + !should_use_gpu(320, 240, &[Some(small_layer)], &[], &[], None), "Single small layer should prefer CPU" ); @@ -738,13 +738,16 @@ fn gpu_should_use_gpu_heuristic() { let l1 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); let l2 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); assert!( - should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[]), + should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[], None), "Two layers should prefer GPU" ); // 1080p single layer → GPU (high pixel count). let big_layer = make_layer(solid_rgba(1920, 1080, 0, 0, 0, 255), 1920, 1080, None); - assert!(should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[]), "1080p should prefer GPU"); + assert!( + should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[], None), + "1080p should prefer GPU" + ); // Single layer with rotation → GPU (effects). let rotated = make_layer_with_props( @@ -761,9 +764,30 @@ fn gpu_should_use_gpu_heuristic() { CropShape::Rect, ); assert!( - should_use_gpu(320, 240, &[Some(rotated)], &[], &[]), + should_use_gpu(320, 240, &[Some(rotated)], &[], &[], None), "Rotated layer should prefer GPU" ); + + // Single small layer with NV12 output → GPU (YUV conversion offload). + let nv12_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); + assert!( + should_use_gpu(320, 240, &[Some(nv12_layer)], &[], &[], Some(PixelFormat::Nv12)), + "NV12 output should prefer GPU even for single small layer" + ); + + // Single small layer with I420 output → GPU (YUV conversion offload). 
+ let i420_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); + assert!( + should_use_gpu(320, 240, &[Some(i420_layer)], &[], &[], Some(PixelFormat::I420)), + "I420 output should prefer GPU even for single small layer" + ); + + // Single small layer with RGBA8 output → CPU (no conversion needed). + let rgba_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); + assert!( + !should_use_gpu(320, 240, &[Some(rgba_layer)], &[], &[], Some(PixelFormat::Rgba8)), + "RGBA8 output should prefer CPU for single small layer" + ); } // ── Phase 2 tests ─────────────────────────────────────────────────────────── @@ -908,12 +932,13 @@ fn gpu_hysteresis_stability() { // First 4 frames voting GPU should NOT flip (hysteresis = 5). for _ in 0..4 { - let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]); + let result = + gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None); assert!(!result, "Should stay on CPU during hysteresis window"); } // 5th consecutive frame should flip to GPU. - let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]); + let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None); assert!(result, "Should flip to GPU after 5 consecutive votes"); // Now on GPU. Build a scene that votes CPU. @@ -922,13 +947,13 @@ fn gpu_hysteresis_stability() { // Interleave: vote CPU, then vote GPU — should reset the counter. for _ in 0..3 { - gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[]); + gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[], None); } // Interrupt with a GPU vote (re-add two layers). 
let l3 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); let l4 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None); let gpu_scene2: Vec> = vec![Some(l3), Some(l4)]; - let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[]); + let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[], None); assert!(result, "Interruption should reset counter; stay on GPU"); } @@ -943,7 +968,7 @@ fn gpu_hysteresis_seeded_skips_warmup() { let gpu_scene: Vec> = vec![Some(l1), Some(l2)]; // First frame should immediately use GPU — no warm-up. - let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]); + let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None); assert!(result, "Seeded state should use GPU on the very first frame"); } diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs index f46e042f..5bf1a3ba 100644 --- a/crates/nodes/src/video/compositor/mod.rs +++ b/crates/nodes/src/video/compositor/mod.rs @@ -563,6 +563,8 @@ impl ProcessorNode for CompositorNode { let initial_canvas_w = self.config.width; #[cfg(feature = "gpu")] let initial_canvas_h = self.config.height; + #[cfg(feature = "gpu")] + let initial_output_format = self.output_format; let composite_thread = tokio::task::spawn_blocking(move || { // Per-slot cache for YUV→RGBA conversions. 
Avoids redundant @@ -606,6 +608,7 @@ impl ProcessorNode for CompositorNode { &[], // no layers yet — seed from canvas size alone &[], &[], + initial_output_format, ); #[cfg(feature = "gpu")] let mut gpu_path_state = gpu::GpuPathState::new_seeded(initial_should_gpu); @@ -657,6 +660,7 @@ impl ProcessorNode for CompositorNode { &work.layers, &work.image_overlays, &work.text_overlays, + work.output_format, ) }, }; diff --git a/crates/nodes/src/video/compositor/tests.rs b/crates/nodes/src/video/compositor/tests.rs index af4d1974..8a295c69 100644 --- a/crates/nodes/src/video/compositor/tests.rs +++ b/crates/nodes/src/video/compositor/tests.rs @@ -1710,11 +1710,14 @@ async fn test_oneshot_output_timestamps_monotonic() { /// even in batch mode, capping throughput at wall-clock fps. #[tokio::test] async fn test_oneshot_processes_faster_than_realtime() { - let frame_count: usize = 30; - let fps: u32 = 30; - // At real-time pacing, 30 frames at 30 fps = 1 second. - // We assert it completes in well under half that. - let max_allowed = std::time::Duration::from_millis(500); + let frame_count: usize = 10; + let fps: u32 = 5; + // At real-time pacing, 10 frames at 5 fps = 2 seconds. + // Without pacing the tiny 4×4 frames should finish well under 1.5s + // even on a loaded CI runner. The previous 30@30 (budget 500ms) + // flaked on the GPU runner because per-frame scheduling overhead + // (~30ms) nearly matched the pacing interval (33ms). + let max_allowed = std::time::Duration::from_millis(1500); let (input_tx, input_rx) = mpsc::channel(256); let mut inputs = HashMap::new(); @@ -1749,7 +1752,10 @@ async fn test_oneshot_processes_faster_than_realtime() { assert!( elapsed < max_allowed, "Oneshot compositor took {elapsed:?} for {frame_count} frames at {fps} fps — \ - expected < {max_allowed:?} (should not be real-time paced)", + expected < {max_allowed:?} (should not be real-time paced). 
\ + If this is close to {} ms (real-time pace), the oneshot tick \ + path may have regressed to interval-based pacing.", + frame_count as u64 * 1000 / u64::from(fps), ); } @@ -2496,7 +2502,16 @@ async fn test_compositor_output_format_runtime_change() { }; // Start with no output_format (RGBA8). - let config = CompositorConfig { width: 4, height: 4, ..Default::default() }; + // Force CPU mode so the test isn't blocked by GpuContext::try_init() + // competing for the GPU with other parallel tests on the self-hosted + // runner. This test validates runtime output_format switching, not + // GPU compositing. + let config = CompositorConfig { + width: 4, + height: 4, + gpu_mode: Some("cpu".to_string()), + ..Default::default() + }; let node = CompositorNode::new(config, GlobalCompositorConfig::default()); let node_handle = tokio::spawn(async move { Box::new(node).run(context).await }); @@ -2507,17 +2522,17 @@ async fn test_compositor_output_format_runtime_change() { // Send a frame — should come out as RGBA8. let frame = make_rgba_frame(2, 2, 255, 0, 0, 255); input_tx.send(Packet::Video(frame)).await.unwrap(); - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + tokio::time::sleep(tokio::time::Duration::from_millis(200)).await; // Send UpdateParams to switch output_format to NV12. let update = serde_json::json!({ "output_format": "nv12" }); ctrl_tx.send(NodeControlMessage::UpdateParams(update)).await.unwrap(); - tokio::time::sleep(tokio::time::Duration::from_millis(50)).await; + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; // Send another frame — should come out as NV12. let frame2 = make_rgba_frame(2, 2, 0, 255, 0, 255); input_tx.send(Packet::Video(frame2)).await.unwrap(); - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + tokio::time::sleep(tokio::time::Duration::from_millis(200)).await; drop(input_tx); drop(ctrl_tx);