Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ on:
branches: [ main ]
workflow_dispatch: {}

# Cancel superseded runs on the same PR / branch so the single
# self-hosted GPU runner isn't blocked by stale jobs.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
skit:
name: Skit
Expand Down
21 changes: 17 additions & 4 deletions crates/nodes/src/video/compositor/gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1551,14 +1551,19 @@ impl GpuMode {
/// Decide whether to use GPU compositing for this frame based on scene
/// complexity. Used when `GpuMode::Auto` is selected.
///
/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop).
/// CPU wins for: single opaque layer at identity scale (memcpy fast path).
/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop),
/// or YUV output (the GPU `rgba_to_yuv.wgsl` shader eliminates the
/// expensive CPU RGBA→NV12/I420 conversion — ~14% of CPU time in
/// profiled pipelines).
/// CPU wins for: single opaque layer at identity scale with RGBA output
/// (memcpy fast path).
pub fn should_use_gpu(
canvas_w: u32,
canvas_h: u32,
layers: &[Option<LayerSnapshot>],
image_overlays: &[Arc<DecodedOverlay>],
text_overlays: &[Arc<DecodedOverlay>],
output_format: Option<PixelFormat>,
) -> bool {
let visible_layers = layers.iter().filter(|l| l.is_some()).count();
let total_items = visible_layers + image_overlays.len() + text_overlays.len();
Expand All @@ -1567,9 +1572,15 @@ pub fn should_use_gpu(
l.rotation_degrees.abs() > 0.01 || l.crop_zoom > 1.01 || l.crop_shape != CropShape::Rect
});

// When the output needs YUV (NV12/I420), the GPU path eliminates the
// CPU RGBA→YUV conversion entirely — the `rgba_to_yuv.wgsl` compute
// shader handles it on the GPU and the CPU only receives the
// already-converted buffer from the readback.
let needs_yuv_output = matches!(output_format, Some(PixelFormat::Nv12 | PixelFormat::I420));

// GPU is worthwhile when there's enough work to amortise
// the upload + readback overhead (~0.5ms for 1080p).
total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects
total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects || needs_yuv_output
}

// ── GPU/CPU path hysteresis ─────────────────────────────────────────────────
Expand Down Expand Up @@ -1615,8 +1626,10 @@ pub fn should_use_gpu_with_state(
layers: &[Option<LayerSnapshot>],
image_overlays: &[Arc<DecodedOverlay>],
text_overlays: &[Arc<DecodedOverlay>],
output_format: Option<PixelFormat>,
) -> bool {
let vote_gpu = should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays);
let vote_gpu =
should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays, output_format);

if vote_gpu == state.last_used_gpu {
// Same path as last frame — reset the flip counter.
Expand Down
45 changes: 35 additions & 10 deletions crates/nodes/src/video/compositor/gpu_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -727,24 +727,27 @@ fn gpu_circle_crop_with_zoom() {
fn gpu_should_use_gpu_heuristic() {
use super::gpu::should_use_gpu;

// Single small layer → CPU.
// Single small layer, RGBA output → CPU.
let small_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
assert!(
!should_use_gpu(320, 240, &[Some(small_layer)], &[], &[]),
!should_use_gpu(320, 240, &[Some(small_layer)], &[], &[], None),
"Single small layer should prefer CPU"
);

// Two layers → GPU.
let l1 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
let l2 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
assert!(
should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[]),
should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[], None),
"Two layers should prefer GPU"
);

// 1080p single layer → GPU (high pixel count).
let big_layer = make_layer(solid_rgba(1920, 1080, 0, 0, 0, 255), 1920, 1080, None);
assert!(should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[]), "1080p should prefer GPU");
assert!(
should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[], None),
"1080p should prefer GPU"
);

// Single layer with rotation → GPU (effects).
let rotated = make_layer_with_props(
Expand All @@ -761,9 +764,30 @@ fn gpu_should_use_gpu_heuristic() {
CropShape::Rect,
);
assert!(
should_use_gpu(320, 240, &[Some(rotated)], &[], &[]),
should_use_gpu(320, 240, &[Some(rotated)], &[], &[], None),
"Rotated layer should prefer GPU"
);

// Single small layer with NV12 output → GPU (YUV conversion offload).
let nv12_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
assert!(
should_use_gpu(320, 240, &[Some(nv12_layer)], &[], &[], Some(PixelFormat::Nv12)),
"NV12 output should prefer GPU even for single small layer"
);

// Single small layer with I420 output → GPU (YUV conversion offload).
let i420_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
assert!(
should_use_gpu(320, 240, &[Some(i420_layer)], &[], &[], Some(PixelFormat::I420)),
"I420 output should prefer GPU even for single small layer"
);

// Single small layer with RGBA8 output → CPU (no conversion needed).
let rgba_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
assert!(
!should_use_gpu(320, 240, &[Some(rgba_layer)], &[], &[], Some(PixelFormat::Rgba8)),
"RGBA8 output should prefer CPU for single small layer"
);
}

// ── Phase 2 tests ───────────────────────────────────────────────────────────
Expand Down Expand Up @@ -908,12 +932,13 @@ fn gpu_hysteresis_stability() {

// First 4 frames voting GPU should NOT flip (hysteresis = 5).
for _ in 0..4 {
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
let result =
gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
assert!(!result, "Should stay on CPU during hysteresis window");
}

// 5th consecutive frame should flip to GPU.
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
assert!(result, "Should flip to GPU after 5 consecutive votes");

// Now on GPU. Build a scene that votes CPU.
Expand All @@ -922,13 +947,13 @@ fn gpu_hysteresis_stability() {

// Interleave: vote CPU, then vote GPU — should reset the counter.
for _ in 0..3 {
gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[]);
gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[], None);
}
// Interrupt with a GPU vote (re-add two layers).
let l3 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
let l4 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
let gpu_scene2: Vec<Option<LayerSnapshot>> = vec![Some(l3), Some(l4)];
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[]);
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[], None);
assert!(result, "Interruption should reset counter; stay on GPU");
}

Expand All @@ -943,7 +968,7 @@ fn gpu_hysteresis_seeded_skips_warmup() {
let gpu_scene: Vec<Option<LayerSnapshot>> = vec![Some(l1), Some(l2)];

// First frame should immediately use GPU — no warm-up.
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
assert!(result, "Seeded state should use GPU on the very first frame");
}

Expand Down
4 changes: 4 additions & 0 deletions crates/nodes/src/video/compositor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,8 @@ impl ProcessorNode for CompositorNode {
let initial_canvas_w = self.config.width;
#[cfg(feature = "gpu")]
let initial_canvas_h = self.config.height;
#[cfg(feature = "gpu")]
let initial_output_format = self.output_format;

let composite_thread = tokio::task::spawn_blocking(move || {
// Per-slot cache for YUV→RGBA conversions. Avoids redundant
Expand Down Expand Up @@ -606,6 +608,7 @@ impl ProcessorNode for CompositorNode {
&[], // no layers yet — seed from canvas size alone
&[],
&[],
initial_output_format,
);
#[cfg(feature = "gpu")]
let mut gpu_path_state = gpu::GpuPathState::new_seeded(initial_should_gpu);
Expand Down Expand Up @@ -657,6 +660,7 @@ impl ProcessorNode for CompositorNode {
&work.layers,
&work.image_overlays,
&work.text_overlays,
work.output_format,
)
},
};
Expand Down
35 changes: 25 additions & 10 deletions crates/nodes/src/video/compositor/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1710,11 +1710,14 @@ async fn test_oneshot_output_timestamps_monotonic() {
/// even in batch mode, capping throughput at wall-clock fps.
#[tokio::test]
async fn test_oneshot_processes_faster_than_realtime() {
let frame_count: usize = 30;
let fps: u32 = 30;
// At real-time pacing, 30 frames at 30 fps = 1 second.
// We assert it completes in well under half that.
let max_allowed = std::time::Duration::from_millis(500);
let frame_count: usize = 10;
let fps: u32 = 5;
// At real-time pacing, 10 frames at 5 fps = 2 seconds.
// Without pacing the tiny 4×4 frames should finish well under 1.5s
// even on a loaded CI runner. The previous configuration (30 frames at
// 30 fps with a 500ms budget) flaked on the GPU runner because per-frame
// scheduling overhead (~30ms) nearly matched the pacing interval (33ms).
let max_allowed = std::time::Duration::from_millis(1500);

let (input_tx, input_rx) = mpsc::channel(256);
let mut inputs = HashMap::new();
Expand Down Expand Up @@ -1749,7 +1752,10 @@ async fn test_oneshot_processes_faster_than_realtime() {
assert!(
elapsed < max_allowed,
"Oneshot compositor took {elapsed:?} for {frame_count} frames at {fps} fps — \
expected < {max_allowed:?} (should not be real-time paced)",
expected < {max_allowed:?} (should not be real-time paced). \
If this is close to {} ms (real-time pace), the oneshot tick \
path may have regressed to interval-based pacing.",
frame_count as u64 * 1000 / u64::from(fps),
);
}

Expand Down Expand Up @@ -2496,7 +2502,16 @@ async fn test_compositor_output_format_runtime_change() {
};

// Start with no output_format (RGBA8).
let config = CompositorConfig { width: 4, height: 4, ..Default::default() };
// Force CPU mode so the test isn't blocked by GpuContext::try_init()
// competing for the GPU with other parallel tests on the self-hosted
// runner. This test validates runtime output_format switching, not
// GPU compositing.
let config = CompositorConfig {
width: 4,
height: 4,
gpu_mode: Some("cpu".to_string()),
..Default::default()
};
let node = CompositorNode::new(config, GlobalCompositorConfig::default());

let node_handle = tokio::spawn(async move { Box::new(node).run(context).await });
Expand All @@ -2507,17 +2522,17 @@ async fn test_compositor_output_format_runtime_change() {
// Send a frame — should come out as RGBA8.
let frame = make_rgba_frame(2, 2, 255, 0, 0, 255);
input_tx.send(Packet::Video(frame)).await.unwrap();
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;

// Send UpdateParams to switch output_format to NV12.
let update = serde_json::json!({ "output_format": "nv12" });
ctrl_tx.send(NodeControlMessage::UpdateParams(update)).await.unwrap();
tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

// Send another frame — should come out as NV12.
let frame2 = make_rgba_frame(2, 2, 0, 255, 0, 255);
input_tx.send(Packet::Video(frame2)).await.unwrap();
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;

drop(input_tx);
drop(ctrl_tx);
Expand Down
Loading