From 69950144d8804337ce6e37f38a3dea0949c99cac Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 29 Mar 2026 09:47:04 +0000
Subject: [PATCH 1/4] feat(compositor): factor output_format into GPU heuristic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the compositor's output_format is NV12 or I420, the GPU path
eliminates the expensive CPU RGBA→YUV conversion entirely (~14% of
CPU time in profiled pipelines). The should_use_gpu() heuristic now
considers this, preferring GPU compositing whenever YUV output is
requested — even for simple scenes that would otherwise stay on CPU.

This addresses the #1 CPU hotspot identified in production profiling:
rgba8_to_nv12_buf at 9.12% + parallel_rows at 5.28% = 14.4% combined.

Signed-off-by: Devin AI <devin@streamkit.dev>
Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/compositor/gpu.rs      | 21 +++++++--
 .../nodes/src/video/compositor/gpu_tests.rs   | 45 ++++++++++++++-----
 crates/nodes/src/video/compositor/mod.rs      |  4 ++
 3 files changed, 56 insertions(+), 14 deletions(-)
diff --git a/crates/nodes/src/video/compositor/gpu.rs b/crates/nodes/src/video/compositor/gpu.rs
index a655e76b..bbc04205 100644
--- a/crates/nodes/src/video/compositor/gpu.rs
+++ b/crates/nodes/src/video/compositor/gpu.rs
@@ -1551,14 +1551,19 @@ impl GpuMode {
 /// Decide whether to use GPU compositing for this frame based on scene
 /// complexity.  Used when `GpuMode::Auto` is selected.
 ///
-/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop).
-/// CPU wins for: single opaque layer at identity scale (memcpy fast path).
+/// GPU wins for: multi-layer, high-resolution, effects (rotation/crop),
+/// or YUV output (the GPU `rgba_to_yuv.wgsl` shader eliminates the
+/// expensive CPU RGBA→NV12/I420 conversion — ~14% of CPU time in
+/// profiled pipelines).
+/// CPU wins for: single opaque layer at identity scale with RGBA output
+/// (memcpy fast path).
 pub fn should_use_gpu(
     canvas_w: u32,
     canvas_h: u32,
     layers: &[Option<LayerSnapshot>],
     image_overlays: &[Arc<DecodedOverlay>],
     text_overlays: &[Arc<DecodedOverlay>],
+    output_format: Option<PixelFormat>,
 ) -> bool {
     let visible_layers = layers.iter().filter(|l| l.is_some()).count();
     let total_items = visible_layers + image_overlays.len() + text_overlays.len();
@@ -1567,9 +1572,15 @@ pub fn should_use_gpu(
         l.rotation_degrees.abs() > 0.01 || l.crop_zoom > 1.01 || l.crop_shape != CropShape::Rect
     });
 
+    // When the output needs YUV (NV12/I420), the GPU path eliminates the
+    // CPU RGBA→YUV conversion entirely — the `rgba_to_yuv.wgsl` compute
+    // shader handles it on the GPU and the CPU only receives the
+    // already-converted buffer from the readback.
+    let needs_yuv_output = matches!(output_format, Some(PixelFormat::Nv12 | PixelFormat::I420));
+
     // GPU is worthwhile when there's enough work to amortise
     // the upload + readback overhead (~0.5ms for 1080p).
-    total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects
+    total_items >= 2 || total_pixels >= 1920 * 1080 || has_effects || needs_yuv_output
 }
 
 // ── GPU/CPU path hysteresis ─────────────────────────────────────────────────
@@ -1615,8 +1626,10 @@ pub fn should_use_gpu_with_state(
     layers: &[Option<LayerSnapshot>],
     image_overlays: &[Arc<DecodedOverlay>],
     text_overlays: &[Arc<DecodedOverlay>],
+    output_format: Option<PixelFormat>,
 ) -> bool {
-    let vote_gpu = should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays);
+    let vote_gpu =
+        should_use_gpu(canvas_w, canvas_h, layers, image_overlays, text_overlays, output_format);
 
     if vote_gpu == state.last_used_gpu {
         // Same path as last frame — reset the flip counter.
diff --git a/crates/nodes/src/video/compositor/gpu_tests.rs b/crates/nodes/src/video/compositor/gpu_tests.rs
index a4599dfc..d1761065 100644
--- a/crates/nodes/src/video/compositor/gpu_tests.rs
+++ b/crates/nodes/src/video/compositor/gpu_tests.rs
@@ -727,10 +727,10 @@ fn gpu_circle_crop_with_zoom() {
 fn gpu_should_use_gpu_heuristic() {
     use super::gpu::should_use_gpu;
 
-    // Single small layer → CPU.
+    // Single small layer, RGBA output → CPU.
     let small_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     assert!(
-        !should_use_gpu(320, 240, &[Some(small_layer)], &[], &[]),
+        !should_use_gpu(320, 240, &[Some(small_layer)], &[], &[], None),
         "Single small layer should prefer CPU"
     );
 
@@ -738,13 +738,16 @@ fn gpu_should_use_gpu_heuristic() {
     let l1 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     let l2 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     assert!(
-        should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[]),
+        should_use_gpu(320, 240, &[Some(l1), Some(l2)], &[], &[], None),
         "Two layers should prefer GPU"
     );
 
     // 1080p single layer → GPU (high pixel count).
     let big_layer = make_layer(solid_rgba(1920, 1080, 0, 0, 0, 255), 1920, 1080, None);
-    assert!(should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[]), "1080p should prefer GPU");
+    assert!(
+        should_use_gpu(1920, 1080, &[Some(big_layer)], &[], &[], None),
+        "1080p should prefer GPU"
+    );
 
     // Single layer with rotation → GPU (effects).
     let rotated = make_layer_with_props(
@@ -761,9 +764,30 @@ fn gpu_should_use_gpu_heuristic() {
         CropShape::Rect,
     );
     assert!(
-        should_use_gpu(320, 240, &[Some(rotated)], &[], &[]),
+        should_use_gpu(320, 240, &[Some(rotated)], &[], &[], None),
         "Rotated layer should prefer GPU"
     );
+
+    // Single small layer with NV12 output → GPU (YUV conversion offload).
+    let nv12_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
+    assert!(
+        should_use_gpu(320, 240, &[Some(nv12_layer)], &[], &[], Some(PixelFormat::Nv12)),
+        "NV12 output should prefer GPU even for single small layer"
+    );
+
+    // Single small layer with I420 output → GPU (YUV conversion offload).
+    let i420_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
+    assert!(
+        should_use_gpu(320, 240, &[Some(i420_layer)], &[], &[], Some(PixelFormat::I420)),
+        "I420 output should prefer GPU even for single small layer"
+    );
+
+    // Single small layer with RGBA8 output → CPU (no conversion needed).
+    let rgba_layer = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
+    assert!(
+        !should_use_gpu(320, 240, &[Some(rgba_layer)], &[], &[], Some(PixelFormat::Rgba8)),
+        "RGBA8 output should prefer CPU for single small layer"
+    );
 }
 
 // ── Phase 2 tests ───────────────────────────────────────────────────────────
@@ -908,12 +932,13 @@ fn gpu_hysteresis_stability() {
 
     // First 4 frames voting GPU should NOT flip (hysteresis = 5).
     for _ in 0..4 {
-        let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
+        let result =
+            gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
         assert!(!result, "Should stay on CPU during hysteresis window");
     }
 
     // 5th consecutive frame should flip to GPU.
-    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
+    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
     assert!(result, "Should flip to GPU after 5 consecutive votes");
 
     // Now on GPU. Build a scene that votes CPU.
@@ -922,13 +947,13 @@ fn gpu_hysteresis_stability() {
 
     // Interleave: vote CPU, then vote GPU — should reset the counter.
     for _ in 0..3 {
-        gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[]);
+        gpu::should_use_gpu_with_state(&mut state, 320, 240, &cpu_scene, &[], &[], None);
     }
     // Interrupt with a GPU vote (re-add two layers).
     let l3 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     let l4 = make_layer(solid_rgba(320, 240, 0, 0, 0, 255), 320, 240, None);
     let gpu_scene2: Vec<Option<LayerSnapshot>> = vec![Some(l3), Some(l4)];
-    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[]);
+    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene2, &[], &[], None);
     assert!(result, "Interruption should reset counter; stay on GPU");
 }
 
@@ -943,7 +968,7 @@ fn gpu_hysteresis_seeded_skips_warmup() {
     let gpu_scene: Vec<Option<LayerSnapshot>> = vec![Some(l1), Some(l2)];
 
     // First frame should immediately use GPU — no warm-up.
-    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[]);
+    let result = gpu::should_use_gpu_with_state(&mut state, 320, 240, &gpu_scene, &[], &[], None);
     assert!(result, "Seeded state should use GPU on the very first frame");
 }
 
diff --git a/crates/nodes/src/video/compositor/mod.rs b/crates/nodes/src/video/compositor/mod.rs
index f46e042f..5bf1a3ba 100644
--- a/crates/nodes/src/video/compositor/mod.rs
+++ b/crates/nodes/src/video/compositor/mod.rs
@@ -563,6 +563,8 @@ impl ProcessorNode for CompositorNode {
         let initial_canvas_w = self.config.width;
         #[cfg(feature = "gpu")]
         let initial_canvas_h = self.config.height;
+        #[cfg(feature = "gpu")]
+        let initial_output_format = self.output_format;
 
         let composite_thread = tokio::task::spawn_blocking(move || {
             // Per-slot cache for YUV→RGBA conversions. Avoids redundant
@@ -606,6 +608,7 @@ impl ProcessorNode for CompositorNode {
                     &[], // no layers yet — seed from canvas size alone
                     &[],
                     &[],
+                    initial_output_format,
                 );
             #[cfg(feature = "gpu")]
             let mut gpu_path_state = gpu::GpuPathState::new_seeded(initial_should_gpu);
@@ -657,6 +660,7 @@ impl ProcessorNode for CompositorNode {
                                     &work.layers,
                                     &work.image_overlays,
                                     &work.text_overlays,
+                                    work.output_format,
                                 )
                         },
                     };

From 8041cfcbfe2507b424f7eac1b7422bd8d5c174b8 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 29 Mar 2026 10:22:12 +0000
Subject: [PATCH 2/4] ci: cancel superseded workflow runs on same PR

Adds a concurrency group keyed on PR number / branch ref with
cancel-in-progress: true. This prevents the single self-hosted GPU
runner from being blocked by stale jobs when new commits are pushed.

Signed-off-by: Devin AI <devin@streamkit.dev>
Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 .github/workflows/ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 39db0efd..2d7dcdef 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,6 +7,12 @@ on:
     branches: [ main ]
   workflow_dispatch: {}
 
+# Cancel superseded runs on the same PR / branch so the single
+# self-hosted GPU runner isn't blocked by stale jobs.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   skit:
     name: Skit

From ea101aa6ebe5cc91b6fecf2e518c05c0c2effef5 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 29 Mar 2026 10:31:09 +0000
Subject: [PATCH 3/4] test(compositor): fix flaky oneshot timing and runtime
 format tests

Two tests flaked on the self-hosted GPU runner where many tests run
concurrently and compete for CPU:

1. test_oneshot_processes_faster_than_realtime: reduced from 30@30fps
   (budget 500ms vs 1000ms real-time = 50% margin, ~17ms per frame) to
   10@5fps (budget 1500ms vs 2000ms real-time = 25% margin, 150ms per
   frame).  The previous per-frame budget was nearly indistinguishable
   from per-frame scheduling overhead (~30ms) under CI load.

2. test_compositor_output_format_runtime_change: increased inter-step
   sleeps from 100/50/100ms to 300/200/300ms.  The compositor thread
   can be starved for CPU when GPU tests run in parallel, so the
   original windows were not enough for even one tick to fire.

Signed-off-by: Devin AI <devin@streamkit.dev>
Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/compositor/tests.rs | 28 +++++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/crates/nodes/src/video/compositor/tests.rs b/crates/nodes/src/video/compositor/tests.rs
index af4d1974..99fe3429 100644
--- a/crates/nodes/src/video/compositor/tests.rs
+++ b/crates/nodes/src/video/compositor/tests.rs
@@ -1710,11 +1710,14 @@ async fn test_oneshot_output_timestamps_monotonic() {
 /// even in batch mode, capping throughput at wall-clock fps.
 #[tokio::test]
 async fn test_oneshot_processes_faster_than_realtime() {
-    let frame_count: usize = 30;
-    let fps: u32 = 30;
-    // At real-time pacing, 30 frames at 30 fps = 1 second.
-    // We assert it completes in well under half that.
-    let max_allowed = std::time::Duration::from_millis(500);
+    let frame_count: usize = 10;
+    let fps: u32 = 5;
+    // At real-time pacing, 10 frames at 5 fps = 2 seconds.
+    // Without pacing the tiny 4×4 frames should finish well under 1.5s
+    // even on a loaded CI runner.  The previous 30@30 (budget 500ms)
+    // flaked on the GPU runner because per-frame scheduling overhead
+    // (~30ms) nearly matched the pacing interval (33ms).
+    let max_allowed = std::time::Duration::from_millis(1500);
 
     let (input_tx, input_rx) = mpsc::channel(256);
     let mut inputs = HashMap::new();
@@ -1749,7 +1752,10 @@ async fn test_oneshot_processes_faster_than_realtime() {
     assert!(
         elapsed < max_allowed,
         "Oneshot compositor took {elapsed:?} for {frame_count} frames at {fps} fps — \
-         expected < {max_allowed:?} (should not be real-time paced)",
+         expected < {max_allowed:?} (should not be real-time paced).  \
+         If this is close to {} ms (real-time pace), the oneshot tick \
+         path may have regressed to interval-based pacing.",
+        frame_count as u64 * 1000 / u64::from(fps),
     );
 }
 
@@ -2507,17 +2513,21 @@ async fn test_compositor_output_format_runtime_change() {
     // Send a frame — should come out as RGBA8.
     let frame = make_rgba_frame(2, 2, 255, 0, 0, 255);
     input_tx.send(Packet::Video(frame)).await.unwrap();
-    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+    // Give the compositor enough ticks to process the frame.  On the
+    // self-hosted GPU runner many tests run in parallel, so the
+    // compositor thread may be starved for CPU.  300ms ≈ 9 ticks at
+    // 30 fps — generous enough even under heavy load.
+    tokio::time::sleep(tokio::time::Duration::from_millis(300)).await;
 
     // Send UpdateParams to switch output_format to NV12.
     let update = serde_json::json!({ "output_format": "nv12" });
     ctrl_tx.send(NodeControlMessage::UpdateParams(update)).await.unwrap();
-    tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
+    tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
 
     // Send another frame — should come out as NV12.
     let frame2 = make_rgba_frame(2, 2, 0, 255, 0, 255);
     input_tx.send(Packet::Video(frame2)).await.unwrap();
-    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+    tokio::time::sleep(tokio::time::Duration::from_millis(300)).await;
 
     drop(input_tx);
     drop(ctrl_tx);

From 52d596bdcce6055e14880eac6f8a2955b51447e6 Mon Sep 17 00:00:00 2001
From: StreamKit Devin <devin@streamkit.dev>
Date: Sun, 29 Mar 2026 10:37:18 +0000
Subject: [PATCH 4/4] test(compositor): fix flaky runtime format change test on
 GPU runner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test consistently got only 1 output frame instead of ≥ 2 on the
self-hosted GPU runner.  Root cause: when compiled with --features gpu
and gpu_mode Auto (the default), the compositor OS thread blocks on
GpuContext::try_init() before processing any compositing work.  On the
GPU runner with many tests competing for the device, init can exceed
the total sleep budget (800ms).  By the time it finishes, both input
frames have been drained to just the latest (Dynamic mode behaviour),
producing a single output.

Fix: set gpu_mode: "cpu" explicitly.  This test validates runtime
output_format switching via UpdateParams, not GPU compositing — GPU
init is unnecessary overhead that creates the race.

Also reduces sleep durations to 200/100/200ms (from 300/200/300ms)
since without GPU init the compositor thread starts processing
immediately.

Signed-off-by: Devin AI <devin@streamkit.dev>
Signed-off-by: StreamKit Devin <devin@streamkit.dev>
Co-Authored-By: Claudio Costa <cstcld91@gmail.com>
---
 crates/nodes/src/video/compositor/tests.rs | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/crates/nodes/src/video/compositor/tests.rs b/crates/nodes/src/video/compositor/tests.rs
index 99fe3429..8a295c69 100644
--- a/crates/nodes/src/video/compositor/tests.rs
+++ b/crates/nodes/src/video/compositor/tests.rs
@@ -2502,7 +2502,16 @@ async fn test_compositor_output_format_runtime_change() {
     };
 
     // Start with no output_format (RGBA8).
-    let config = CompositorConfig { width: 4, height: 4, ..Default::default() };
+    // Force CPU mode so the test isn't blocked by GpuContext::try_init()
+    // competing for the GPU with other parallel tests on the self-hosted
+    // runner.  This test validates runtime output_format switching, not
+    // GPU compositing.
+    let config = CompositorConfig {
+        width: 4,
+        height: 4,
+        gpu_mode: Some("cpu".to_string()),
+        ..Default::default()
+    };
     let node = CompositorNode::new(config, GlobalCompositorConfig::default());
 
     let node_handle = tokio::spawn(async move { Box::new(node).run(context).await });
@@ -2513,21 +2522,17 @@ async fn test_compositor_output_format_runtime_change() {
     // Send a frame — should come out as RGBA8.
     let frame = make_rgba_frame(2, 2, 255, 0, 0, 255);
     input_tx.send(Packet::Video(frame)).await.unwrap();
-    // Give the compositor enough ticks to process the frame.  On the
-    // self-hosted GPU runner many tests run in parallel, so the
-    // compositor thread may be starved for CPU.  300ms ≈ 9 ticks at
-    // 30 fps — generous enough even under heavy load.
-    tokio::time::sleep(tokio::time::Duration::from_millis(300)).await;
+    tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
 
     // Send UpdateParams to switch output_format to NV12.
     let update = serde_json::json!({ "output_format": "nv12" });
     ctrl_tx.send(NodeControlMessage::UpdateParams(update)).await.unwrap();
-    tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
+    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
 
     // Send another frame — should come out as NV12.
     let frame2 = make_rgba_frame(2, 2, 0, 255, 0, 255);
     input_tx.send(Packet::Video(frame2)).await.unwrap();
-    tokio::time::sleep(tokio::time::Duration::from_millis(300)).await;
+    tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;
 
     drop(input_tx);
     drop(ctrl_tx);