diff --git a/Cargo.toml b/Cargo.toml
index 03c104ca..681d4619 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ gltf = { version = "1.1", default-features = false }
 log = "0.4"
 mint = "0.5"
 naga = { version = "22", features = ["wgsl-in"] }
+nanorand = { version = "0.7", default-features = false }
 profiling = "1"
 slab = "0.4"
 strum = { version = "0.25", features = ["derive"] }
@@ -79,7 +80,7 @@ glam = { workspace = true }
 log = { workspace = true }
 mint = { workspace = true, features = ["serde"] }
 naga = { workspace = true }
-nanorand = { version = "0.7", default-features = false, features = ["wyrand"] }
+nanorand = { workspace = true, features = ["wyrand"] }
 profiling = { workspace = true }
 ron = "0.8"
 serde = { version = "1", features = ["serde_derive"] }
@@ -95,7 +96,6 @@ egui-winit = "0.28"
 console_error_panic_hook = "0.1.7"
 console_log = "1"
 web-sys = { workspace = true, features = ["Window"] }
-getrandom = { version = "0.2", features = ["js"] }
 
 [target.'cfg(any(target_os = "windows", target_os = "linux"))'.dev-dependencies]
 renderdoc = "0.12"
diff --git a/blade-graphics/src/util.rs b/blade-graphics/src/util.rs
index 23e876e0..f0d2013e 100644
--- a/blade-graphics/src/util.rs
+++ b/blade-graphics/src/util.rs
@@ -94,14 +94,19 @@ impl super::TextureFormat {
     }
 }
 
+impl super::Extent {
+    /// Counts the `size`-sized groups needed to cover this extent, rounding up per axis.
+    /// Panics if any component of `size` is zero (division by zero).
+    pub fn group_by(&self, size: [u32; 3]) -> [u32; 3] {
+        let dims = [self.width, self.height, self.depth];
+        // `div_ceil` cannot overflow, unlike the manual `(a + b - 1) / b` rounding.
+        std::array::from_fn(|axis| dims[axis].div_ceil(size[axis]))
+    }
+}
+
 impl super::ComputePipeline {
     /// Return the dispatch group counts sufficient to cover the given extent.
     pub fn get_dispatch_for(&self, extent: super::Extent) -> [u32; 3] {
-        let wg_size = self.get_workgroup_size();
-        [
-            (extent.width + wg_size[0] - 1) / wg_size[0],
-            (extent.height + wg_size[1] - 1) / wg_size[1],
-            (extent.depth + wg_size[2] - 1) / wg_size[2],
-        ]
+        extent.group_by(self.get_workgroup_size())
     }
 }
diff --git a/blade-helpers/src/camera.rs b/blade-helpers/src/camera.rs
index b866ca7f..246d014b 100644
--- a/blade-helpers/src/camera.rs
+++ b/blade-helpers/src/camera.rs
@@ -1,5 +1,7 @@
 use super::ExposeHud;
 
+const MAX_FLY_SPEED: f32 = 1000000.0; // upper bound for `fly_speed` (HUD slider and mouse wheel)
+
 pub struct ControlledCamera {
     pub inner: blade_render::Camera,
     pub fly_speed: f32,
@@ -86,6 +88,14 @@ impl ControlledCamera {
 
         true
     }
+
+    pub fn on_wheel(&mut self, delta: winit::event::MouseScrollDelta) {
+        let shift = match delta {
+            winit::event::MouseScrollDelta::LineDelta(_, lines) => lines, // one line scales by e
+            winit::event::MouseScrollDelta::PixelDelta(pos) => pos.y as f32 / 50.0, // px -> lines
+        };
+        self.fly_speed = (self.fly_speed * shift.exp()).clamp(1.0, MAX_FLY_SPEED);
+    }
 }
 
 impl ExposeHud for ControlledCamera {
@@ -105,7 +115,7 @@ impl ExposeHud for ControlledCamera {
         });
         ui.add(egui::Slider::new(&mut self.inner.fov_y, 0.5f32..=2.0f32).text("FOV"));
         ui.add(
-            egui::Slider::new(&mut self.fly_speed, 1f32..=100000f32)
+            egui::Slider::new(&mut self.fly_speed, 1f32..=MAX_FLY_SPEED)
                 .text("Fly speed")
                 .logarithmic(true),
         );
diff --git a/blade-helpers/src/hud.rs b/blade-helpers/src/hud.rs
index a135a337..8940825f 100644
--- a/blade-helpers/src/hud.rs
+++ b/blade-helpers/src/hud.rs
@@ -15,27 +15,34 @@ impl ExposeHud for blade_render::RayConfig {
         );
         ui.checkbox(&mut self.temporal_tap, "Temporal tap");
         ui.add(
-            egui::widgets::Slider::new(&mut self.temporal_history, 0..=50).text("Temporal history"),
+            egui::widgets::Slider::new(&mut self.temporal_confidence, 0.0..=50.0)
+                .text("Temporal confidence"),
         );
         ui.add(egui::widgets::Slider::new(&mut self.spatial_taps, 0..=10).text("Spatial taps"));
         ui.add(
-            egui::widgets::Slider::new(&mut self.spatial_tap_history, 0..=50)
-                .text("Spatial tap history"),
+            egui::widgets::Slider::new(&mut self.spatial_confidence, 0.0..=50.0)
+                .text("Spatial confidence"),
         );
+        ui.add(egui::widgets::Slider::new(&mut self.group_mixer, 1..=10).text("Group mixer"));
         ui.add(
-            egui::widgets::Slider::new(&mut self.spatial_radius, 1..=50)
-                .text("Spatial radius (px)"),
+            egui::widgets::Slider::new(&mut self.spatial_min_distance, 1..=10)
+                .text("Spatial minimum distance (px)"),
         );
         ui.add(
             egui::widgets::Slider::new(&mut self.t_start, 0.001..=0.5)
                 .text("T min")
                 .logarithmic(true),
         );
+        ui.checkbox(&mut self.pairwise_mis, "Pairwise MIS");
+        ui.add(
+            egui::widgets::Slider::new(&mut self.defensive_mis, 0.0..=1.0).text("Defensive MIS"),
+        );
     }
 }
 
 impl ExposeHud for blade_render::DenoiserConfig {
     fn populate_hud(&mut self, ui: &mut egui::Ui) {
+        ui.checkbox(&mut self.enabled, "Enable denoiser");
         ui.add(egui::Slider::new(&mut self.temporal_weight, 0.0..=1.0f32).text("Temporal weight"));
         ui.add(egui::Slider::new(&mut self.num_passes, 0..=5u32).text("A-trous passes"));
     }
@@ -57,6 +64,15 @@ impl ExposeHud for blade_render::PostProcConfig {
     }
 }
 
+impl ExposeHud for blade_render::FrameConfig {
+    fn populate_hud(&mut self, ui: &mut egui::Ui) {
+        ui.horizontal(|ui| {
+            self.reset_reservoirs |= ui.button("Reset Accumulation").clicked(); // latched on click
+            ui.toggle_value(&mut self.frozen, "Freeze"); // edits `frozen` in place
+        });
+    }
+}
+
 impl ExposeHud for blade_render::DebugConfig {
     fn populate_hud(&mut self, ui: &mut egui::Ui) {
         use strum::IntoEnumIterator as _;
diff --git a/blade-render/code/a-trous.wgsl b/blade-render/code/a-trous.wgsl
new file mode 100644
index 00000000..d4eda9e6
--- /dev/null
+++ b/blade-render/code/a-trous.wgsl
@@ -0,0 +1,79 @@
+#include "quaternion.inc.wgsl"
+#include "surface.inc.wgsl"
+
+// Spatio-temporal variance-guided filtering
+// https://research.nvidia.com/sites/default/files/pubs/2017-07_Spatiotemporal-Variance-Guided-Filtering%3A//svgf_preprint.pdf
+
+// Note: using "ilm" in place of "illumination and the 2nd moment of its luminance"
+
+struct Params {
+    extent: vec2<i32>, // image size in pixels; pixels at or beyond it are skipped
+    iteration: u32, // pass index: filter taps are spaced `1 << iteration` pixels apart
+}
+
+var<uniform> params: Params;
+var t_depth: texture_2d<f32>;
+var t_flat_normal: texture_2d<f32>;
+var input: texture_2d<f32>;
+var output: texture_storage_2d<rgba16float, write>;
+
+const LUMA: vec3<f32> = vec3<f32>(0.2126, 0.7152, 0.0722);
+const MIN_WEIGHT: f32 = 0.01;
+
+fn read_surface(pixel: vec2<i32>) -> Surface { // loads the per-pixel data the filter compares
+    var surface = Surface(); // zero value; any field not assigned below stays zero
+    surface.flat_normal = normalize(textureLoad(t_flat_normal, pixel, 0).xyz);
+    surface.depth = textureLoad(t_depth, pixel, 0).x;
+    return surface;
+}
+
+const GAUSSIAN_WEIGHTS = vec2<f32>(0.44198, 0.27901);
+const SIGMA_L: f32 = 4.0;
+const EPSILON: f32 = 0.001;
+
+fn compare_luminance(a_lum: f32, b_lum: f32, variance: f32) -> f32 { // 1.0 when equal, decays with the difference
+    return exp(-abs(a_lum - b_lum) / (SIGMA_L * variance + EPSILON)); // EPSILON keeps the divisor non-zero at variance == 0
+}
+
+fn w4(w: f32) -> vec4<f32> { // expands a scalar weight into a per-component "ilm" weight
+    return vec4<f32>(vec3<f32>(w), w * w); // xyz use w, the .w component uses w squared
+}
+
+@compute @workgroup_size(8, 8)
+fn atrous3x3(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    let center = vec2<i32>(global_id.xy);
+    if (any(center >= params.extent)) { // invocation lies outside the image
+        return;
+    }
+
+    let center_ilm = textureLoad(input, center, 0);
+    let center_luma = dot(center_ilm.xyz, LUMA);
+    let variance = sqrt(center_ilm.w); // NOTE(review): square root of the stored .w moment, not a variance proper — confirm naming
+    let center_suf = read_surface(center);
+    var sum_weight = GAUSSIAN_WEIGHTS[0] * GAUSSIAN_WEIGHTS[0]; // kernel weight of the center tap itself
+    var sum_ilm = w4(sum_weight) * center_ilm;
+
+    for (var yy=-1; yy<=1; yy+=1) {
+        for (var xx=-1; xx<=1; xx+=1) {
+            let p = center + vec2<i32>(xx, yy) * (1i << params.iteration); // tap spacing doubles with each iteration
+            if (all(p == center) || any(p < vec2<i32>(0)) || any(p >= params.extent)) {
+                continue; // center is already accumulated; off-image taps are dropped
+            }
+
+            //TODO: store in group-shared memory
+            let surface = read_surface(p);
+            var weight = GAUSSIAN_WEIGHTS[abs(xx)] * GAUSSIAN_WEIGHTS[abs(yy)]; // separable 3x3 kernel weight
+            //TODO: make it stricter on higher iterations
+            weight *= compare_flat_normals(surface.flat_normal, center_suf.flat_normal);
+            //Note: should we use a projected depth instead of the surface one?
+            weight *= compare_depths(surface.depth, center_suf.depth);
+            let other_ilm = textureLoad(input, p, 0);
+            weight *= compare_luminance(center_luma, dot(other_ilm.xyz, LUMA), variance);
+            sum_ilm += w4(weight) * other_ilm; // xyz weighted by `weight`, .w by `weight` squared
+            sum_weight += weight;
+        }
+    }
+
+    let filtered_ilm = select(center_ilm, sum_ilm / w4(sum_weight), sum_weight > MIN_WEIGHT); // falls back to the unfiltered center when the total weight is too small
+    textureStore(output, global_id.xy, filtered_ilm);
+}
diff --git a/blade-render/code/accum.inc.wgsl b/blade-render/code/accum.inc.wgsl
new file mode 100644
index 00000000..5b30c0a2
--- /dev/null
+++ b/blade-render/code/accum.inc.wgsl
@@ -0,0 +1,21 @@
+const LUMA: vec3<f32> = vec3<f32>(0.2126, 0.7152, 0.0722);
+const MOTION_FACTOR: f32 = 0.1;
+
+var inout_diffuse: texture_storage_2d<rgba16float, read_write>;
+
+fn accumulate_temporal(
+    pixel: vec2<i32>, cur_illumination: vec3<f32>,
+    temporal_weight: f32, prev_pixel: vec2<i32>,
+    motion_sqr: f32,
+) {
+    var illumination = cur_illumination;
+    if (prev_pixel.x >= 0 && temporal_weight < 1.0) { // negative x presumably marks "no usable history" — confirm with callers
+        let factor = mix(temporal_weight, 1.0, min(pow(motion_sqr, 0.25) * MOTION_FACTOR, 1.0)); // more motion moves the blend toward the current sample
+        let prev_illumination = textureLoad(inout_diffuse, prev_pixel).xyz; // NOTE(review): reads the texture this function also writes — confirm there is no hazard when prev_pixel != pixel
+        illumination = mix(prev_illumination, illumination, factor);
+    }
+
+    let luminocity = dot(illumination, LUMA);
+    let ilm = vec4<f32>(illumination, luminocity * luminocity); // .w carries the squared luminance
+    textureStore(inout_diffuse, pixel, ilm);
+}
diff --git a/blade-render/code/blur.wgsl b/blade-render/code/blur.wgsl
deleted file mode 100644
index 3207ef60..00000000
--- a/blade-render/code/blur.wgsl
+++ /dev/null
@@ -1,162 +0,0 @@
-#include "camera.inc.wgsl"
-#include "gbuf.inc.wgsl"
-#include "quaternion.inc.wgsl"
-#include "surface.inc.wgsl"
-
-// Spatio-temporal variance-guided filtering
-// https://research.nvidia.com/sites/default/files/pubs/2017-07_Spatiotemporal-Variance-Guided-Filtering%3A//svgf_preprint.pdf
-
-// Note: using "ilm" in place of "illumination and the 2nd moment of its luminance"
-
-struct Params {
-    extent: vec2<i32>,
-    temporal_weight: f32,
-    iteration: u32,
-    use_motion_vectors: u32,
-}
-
-var<uniform> camera: CameraParams;
-var<uniform> prev_camera: CameraParams;
-var<uniform> params: Params;
-var t_depth: texture_2d<f32>;
-var t_prev_depth: texture_2d<f32>;
-var t_flat_normal: texture_2d<f32>;
-var t_prev_flat_normal: texture_2d<f32>;
-var t_motion: texture_2d<f32>;
-var input: texture_2d<f32>;
-var prev_input: texture_2d<f32>;
-var output: texture_storage_2d<rgba16float, write>;
-
-const LUMA: vec3<f32> = vec3<f32>(0.2126, 0.7152, 0.0722);
-const MIN_WEIGHT: f32 = 0.01;
-
-fn read_surface(pixel: vec2<i32>) -> Surface {
-    var surface = Surface();
-    surface.flat_normal = normalize(textureLoad(t_flat_normal, pixel, 0).xyz);
-    surface.depth = textureLoad(t_depth, pixel, 0).x;
-    return surface;
-}
-fn read_prev_surface(pixel: vec2<i32>) -> Surface {
-    var surface = Surface();
-    surface.flat_normal = normalize(textureLoad(t_prev_flat_normal, pixel, 0).xyz);
-    surface.depth = textureLoad(t_prev_depth, pixel, 0).x;
-    return surface;
-}
-
-fn get_prev_pixel(pixel: vec2<i32>, pos_world: vec3<f32>) -> vec2<f32> {
-    if (USE_MOTION_VECTORS && params.use_motion_vectors != 0u) {
-        let motion = textureLoad(t_motion, pixel, 0).xy / MOTION_SCALE;
-        return vec2<f32>(pixel) + 0.5 + motion;
-    } else {
-        return get_projected_pixel_float(prev_camera, pos_world);
-    }
-}
-
-@compute @workgroup_size(8, 8)
-fn temporal_accum(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let pixel = vec2<i32>(global_id.xy);
-    if (any(pixel >= params.extent)) {
-        return;
-    }
-
-    let surface = read_surface(pixel);
-    let pos_world = camera.position + surface.depth * get_ray_direction(camera, pixel);
-    // considering all samples in 2x2 quad, to help with edges
-    var center_pixel = get_prev_pixel(pixel, pos_world);
-    var prev_pixels = array<vec2<i32>, 4>(
-        vec2<i32>(vec2<f32>(center_pixel.x - 0.5, center_pixel.y - 0.5)),
-        vec2<i32>(vec2<f32>(center_pixel.x + 0.5, center_pixel.y - 0.5)),
-        vec2<i32>(vec2<f32>(center_pixel.x + 0.5, center_pixel.y + 0.5)),
-        vec2<i32>(vec2<f32>(center_pixel.x - 0.5, center_pixel.y + 0.5)),
-    );
-    //Note: careful about the pixel center when there is a perfect match
-    let w_bot_right = fract(center_pixel + vec2<f32>(0.5));
-    var prev_weights = vec4<f32>(
-        (1.0 - w_bot_right.x) * (1.0 - w_bot_right.y),
-        w_bot_right.x * (1.0 - w_bot_right.y),
-        w_bot_right.x * w_bot_right.y,
-        (1.0 - w_bot_right.x) * w_bot_right.y,
-    );
-
-    var sum_weight = 0.0;
-    var sum_ilm = vec4<f32>(0.0);
-    //TODO: optimize depth load with a gather operation
-    for (var i = 0; i < 4; i += 1) {
-        let prev_pixel = prev_pixels[i];
-        if (all(prev_pixel >= vec2<i32>(0)) && all(prev_pixel < params.extent)) {
-            let prev_surface = read_prev_surface(prev_pixel);
-            if (compare_flat_normals(surface.flat_normal, prev_surface.flat_normal) < 0.5) {
-                continue;
-            }
-            let projected_distance = length(pos_world - prev_camera.position);
-            if (compare_depths(prev_surface.depth, projected_distance) < 0.5) {
-                continue;
-            }
-            let w = prev_weights[i];
-            sum_weight += w;
-            let illumination = w * textureLoad(prev_input, prev_pixel, 0).xyz;
-            let luminocity = dot(illumination, LUMA);
-            sum_ilm += vec4<f32>(illumination, luminocity * luminocity);
-        }
-    }
-
-    let cur_illumination = textureLoad(input, pixel, 0).xyz;
-    let cur_luminocity = dot(cur_illumination, LUMA);
-    var mixed_ilm = vec4<f32>(cur_illumination, cur_luminocity * cur_luminocity);
-    if (sum_weight > MIN_WEIGHT) {
-        let prev_ilm = sum_ilm / vec4(vec3<f32>(sum_weight), max(0.001, sum_weight*sum_weight));
-        mixed_ilm = mix(mixed_ilm, prev_ilm, sum_weight * (1.0 - params.temporal_weight));
-    }
-    textureStore(output, global_id.xy, mixed_ilm);
-}
-
-const GAUSSIAN_WEIGHTS = vec2<f32>(0.44198, 0.27901);
-const SIGMA_L: f32 = 4.0;
-const EPSILON: f32 = 0.001;
-
-fn compare_luminance(a_lum: f32, b_lum: f32, variance: f32) -> f32 {
-    return exp(-abs(a_lum - b_lum) / (SIGMA_L * variance + EPSILON));
-}
-
-fn w4(w: f32) -> vec4<f32> {
-    return vec4<f32>(vec3<f32>(w), w * w);
-}
-
-@compute @workgroup_size(8, 8)
-fn atrous3x3(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let center = vec2<i32>(global_id.xy);
-    if (any(center >= params.extent)) {
-        return;
-    }
-
-    let center_ilm = textureLoad(input, center, 0);
-    let center_luma = dot(center_ilm.xyz, LUMA);
-    let variance = sqrt(center_ilm.w);
-    let center_suf = read_surface(center);
-    var sum_weight = GAUSSIAN_WEIGHTS[0] * GAUSSIAN_WEIGHTS[0];
-    var sum_ilm = w4(sum_weight) * center_ilm;
-
-    for (var yy=-1; yy<=1; yy+=1) {
-        for (var xx=-1; xx<=1; xx+=1) {
-            let p = center + vec2<i32>(xx, yy) * (1i << params.iteration);
-            if (all(p == center) || any(p < vec2<i32>(0)) || any(p >= params.extent)) {
-                continue;
-            }
-
-            //TODO: store in group-shared memory
-            let surface = read_surface(p);
-            var weight = GAUSSIAN_WEIGHTS[abs(xx)] * GAUSSIAN_WEIGHTS[abs(yy)];
-            //TODO: make it stricter on higher iterations
-            weight *= compare_flat_normals(surface.flat_normal, center_suf.flat_normal);
-            //Note: should we use a projected depth instead of the surface one?
-            weight *= compare_depths(surface.depth, center_suf.depth);
-            let other_ilm = textureLoad(input, p, 0);
-            weight *= compare_luminance(center_luma, dot(other_ilm.xyz, LUMA), variance);
-            sum_ilm += w4(weight) * other_ilm;
-            sum_weight += weight;
-        }
-    }
-
-    let filtered_ilm = select(center_ilm, sum_ilm / w4(sum_weight), sum_weight > MIN_WEIGHT);
-    textureStore(output, global_id.xy, filtered_ilm);
-}
diff --git a/blade-render/code/color.inc.wgsl b/blade-render/code/color.inc.wgsl
new file mode 100644
index 00000000..8e84cfda
--- /dev/null
+++ b/blade-render/code/color.inc.wgsl
@@ -0,0 +1,19 @@
+// HSV -> RGB; `h` is in degrees and is wrapped into [0, 360) before the sector lookup.
+fn hsv_to_rgb(h: f32, s: f32, v: f32) -> vec3<f32> {
+    let hue = h - 360.0 * floor(h / 360.0);
+    let chroma = v * s;
+    let minor = chroma * (1.0 - abs((hue / 60.0) % 2.0 - 1.0));
+    var offset = vec3<f32>(chroma, 0.0, minor); // default: sector [300, 360)
+    if (hue < 60.0) {
+        offset = vec3<f32>(chroma, minor, 0.0);
+    } else if (hue < 120.0) {
+        offset = vec3<f32>(minor, chroma, 0.0);
+    } else if (hue < 180.0) {
+        offset = vec3<f32>(0.0, chroma, minor);
+    } else if (hue < 240.0) {
+        offset = vec3<f32>(0.0, minor, chroma);
+    } else if (hue < 300.0) {
+        offset = vec3<f32>(minor, 0.0, chroma);
+    }
+    return vec3<f32>(v - chroma) + offset;
+}
diff --git a/blade-render/code/env-importance.inc.wgsl b/blade-render/code/env-importance.inc.wgsl
index 43421fce..f5a77914 100644
--- a/blade-render/code/env-importance.inc.wgsl
+++ b/blade-render/code/env-importance.inc.wgsl
@@ -11,7 +11,7 @@ fn compute_texel_solid_angle(itc: vec2<i32>, dim: vec2<u32>) -> f32 {
     return meridian_solid_angle * meridian_part;
 }
 
-fn generate_environment_sample(rng: ptr<function, RandomState>, dim: vec2<u32>) -> EnvImportantSample {
+fn generate_environment_sample(rng: ptr<private, RandomState>, dim: vec2<u32>) -> EnvImportantSample {
     var es = EnvImportantSample();
     es.pdf = 1.0;
     var mip = i32(textureNumLevels(env_weights));
diff --git a/blade-render/code/fill-gbuf.wgsl b/blade-render/code/fill-gbuf.wgsl
deleted file mode 100644
index 346edf51..00000000
--- a/blade-render/code/fill-gbuf.wgsl
+++ /dev/null
@@ -1,204 +0,0 @@
-#include "quaternion.inc.wgsl"
-#include "camera.inc.wgsl"
-#include "debug.inc.wgsl"
-#include "debug-param.inc.wgsl"
-#include "gbuf.inc.wgsl"
-
-//TODO: use proper WGSL
-const RAY_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
-
-// Has to match the host!
-struct Vertex {
-    pos: vec3<f32>,
-    bitangent_sign: f32,
-    tex_coords: vec2<f32>,
-    normal: u32,
-    tangent: u32,
-}
-struct VertexBuffer {
-    data: array<Vertex>,
-}
-struct IndexBuffer {
-    data: array<u32>,
-}
-var<storage, read> vertex_buffers: binding_array<VertexBuffer>;
-var<storage, read> index_buffers: binding_array<IndexBuffer>;
-var textures: binding_array<texture_2d<f32>>;
-var sampler_linear: sampler;
-var sampler_nearest: sampler;
-
-struct HitEntry {
-    index_buf: u32,
-    vertex_buf: u32,
-    winding: f32,
-    // packed quaternion
-    geometry_to_world_rotation: u32,
-    geometry_to_object: mat4x3<f32>,
-    prev_object_to_world: mat4x3<f32>,
-    base_color_texture: u32,
-    // packed color factor
-    base_color_factor: u32,
-    normal_texture: u32,
-}
-var<storage, read> hit_entries: array<HitEntry>;
-
-var<uniform> camera: CameraParams;
-var<uniform> prev_camera: CameraParams;
-var<uniform> debug: DebugParams;
-var acc_struct: acceleration_structure;
-
-var out_depth: texture_storage_2d<r32float, write>;
-var out_flat_normal: texture_storage_2d<rgba8snorm, write>;
-var out_basis: texture_storage_2d<rgba8snorm, write>;
-var out_albedo: texture_storage_2d<rgba8unorm, write>;
-var out_motion: texture_storage_2d<rg8snorm, write>;
-var out_debug: texture_storage_2d<rgba8unorm, write>;
-
-fn decode_normal(raw: u32) -> vec3<f32> {
-    return unpack4x8snorm(raw).xyz;
-}
-
-fn debug_raw_normal(pos: vec3<f32>, normal_raw: u32, rotation: vec4<f32>, debug_len: f32, color: u32) {
-    let nw = normalize(qrot(rotation, decode_normal(normal_raw)));
-    debug_line(pos, pos + debug_len * nw, color);
-}
-
-@compute @workgroup_size(8, 4)
-fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    if (any(global_id.xy >= camera.target_size)) {
-        return;
-    }
-
-    var rq: ray_query;
-    let ray_dir = get_ray_direction(camera, vec2<i32>(global_id.xy));
-    rayQueryInitialize(&rq, acc_struct, RayDesc(RAY_FLAG_CULL_NO_OPAQUE, 0xFFu, 0.0, camera.depth, camera.position, ray_dir));
-    rayQueryProceed(&rq);
-    let intersection = rayQueryGetCommittedIntersection(&rq);
-
-    var depth = 0.0;
-    var basis = vec4<f32>(0.0);
-    var flat_normal = vec3<f32>(0.0);
-    var albedo = vec3<f32>(1.0);
-    var motion = vec2<f32>(0.0);
-    let enable_debug = all(global_id.xy == debug.mouse_pos);
-
-    if (intersection.kind != RAY_QUERY_INTERSECTION_NONE) {
-        let entry = hit_entries[intersection.instance_custom_index + intersection.geometry_index];
-        depth = intersection.t;
-
-        var indices = intersection.primitive_index * 3u + vec3<u32>(0u, 1u, 2u);
-        if (entry.index_buf != ~0u) {
-            let iptr = &index_buffers[entry.index_buf].data;
-            indices = vec3<u32>((*iptr)[indices.x], (*iptr)[indices.y], (*iptr)[indices.z]);
-        }
-
-        let vptr = &vertex_buffers[entry.vertex_buf].data;
-        let vertices = array<Vertex, 3>(
-            (*vptr)[indices.x],
-            (*vptr)[indices.y],
-            (*vptr)[indices.z],
-        );
-
-        let positions_object = entry.geometry_to_object * mat3x4(
-            vec4<f32>(vertices[0].pos, 1.0), vec4<f32>(vertices[1].pos, 1.0), vec4<f32>(vertices[2].pos, 1.0)
-        );
-        let positions = intersection.object_to_world * mat3x4(
-            vec4<f32>(positions_object[0], 1.0), vec4<f32>(positions_object[1], 1.0), vec4<f32>(positions_object[2], 1.0)
-        );
-        flat_normal = entry.winding * normalize(cross(positions[1].xyz - positions[0].xyz, positions[2].xyz - positions[0].xyz));
-
-        let barycentrics = vec3<f32>(1.0 - intersection.barycentrics.x - intersection.barycentrics.y, intersection.barycentrics);
-        let position_object = vec4<f32>(positions_object * barycentrics, 1.0);
-        let tex_coords = mat3x2(vertices[0].tex_coords, vertices[1].tex_coords, vertices[2].tex_coords) * barycentrics;
-        let normal_geo = normalize(mat3x3(decode_normal(vertices[0].normal), decode_normal(vertices[1].normal), decode_normal(vertices[2].normal)) * barycentrics);
-        let tangent_geo = normalize(mat3x3(decode_normal(vertices[0].tangent), decode_normal(vertices[1].tangent), decode_normal(vertices[2].tangent)) * barycentrics);
-        let bitangent_geo = normalize(cross(normal_geo, tangent_geo)) * vertices[0].bitangent_sign;
-
-        let lod = 0.0; //TODO: this is actually complicated
-
-        let geo_to_world_rot = normalize(unpack4x8snorm(entry.geometry_to_world_rotation));
-        let tangent_space_geo = mat3x3(tangent_geo, bitangent_geo, normal_geo);
-        var normal_local: vec3<f32>;
-        if ((debug.texture_flags & DebugTextureFlags_NORMAL) != 0u) {
-            normal_local = vec3<f32>(0.0, 0.0, 1.0); // ignore normal map
-        } else {
-            let n_xy = textureSampleLevel(textures[entry.normal_texture], sampler_linear, tex_coords, lod).xy;
-            normal_local = vec3<f32>(n_xy, sqrt(max(0.0, 1.0 - dot(n_xy.xy, n_xy.xy))));
-        }
-        var normal = qrot(geo_to_world_rot, tangent_space_geo * normal_local);
-        basis = shortest_arc_quat(vec3<f32>(0.0, 0.0, 1.0), normalize(normal));
-
-        let hit_position = camera.position + intersection.t * ray_dir;
-        if (enable_debug) {
-            debug_buf.entry.custom_index = intersection.instance_custom_index;
-            debug_buf.entry.depth = intersection.t;
-            debug_buf.entry.tex_coords = tex_coords;
-            debug_buf.entry.base_color_texture = entry.base_color_texture;
-            debug_buf.entry.normal_texture = entry.normal_texture;
-            debug_buf.entry.position = hit_position;
-            debug_buf.entry.flat_normal = flat_normal;
-        }
-        if (enable_debug && (debug.draw_flags & DebugDrawFlags_SPACE) != 0u) {
-            let normal_len = 0.15 * intersection.t;
-            let side = 0.05 * intersection.t;
-            debug_line(hit_position, hit_position + normal_len * qrot(geo_to_world_rot, normal_geo), 0xFFFFFFu);
-            debug_line(hit_position - side * tangent_geo, hit_position + side * tangent_geo, 0x808080u);
-            debug_line(hit_position - side * bitangent_geo, hit_position + side * bitangent_geo, 0x808080u);
-        }
-        if (enable_debug && (debug.draw_flags & DebugDrawFlags_GEOMETRY) != 0u) {
-            let debug_len = intersection.t * 0.2;
-            debug_line(positions[0].xyz, positions[1].xyz, 0x00FFFFu);
-            debug_line(positions[1].xyz, positions[2].xyz, 0x00FFFFu);
-            debug_line(positions[2].xyz, positions[0].xyz, 0x00FFFFu);
-            let poly_center = (positions[0].xyz + positions[1].xyz + positions[2].xyz) / 3.0;
-            debug_line(poly_center, poly_center + 0.2 * debug_len * flat_normal, 0xFF00FFu);
-            // note: dynamic indexing into positions isn't allowed by WGSL yet
-            debug_raw_normal(positions[0].xyz, vertices[0].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
-            debug_raw_normal(positions[1].xyz, vertices[1].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
-            debug_raw_normal(positions[2].xyz, vertices[2].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
-            // draw tangent space
-            debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(1.0, 0.0, 0.0)), 0x0000FFu);
-            debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 1.0, 0.0)), 0x00FF00u);
-            debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 0.0, 1.0)), 0xFF0000u);
-        }
-
-        let base_color_factor = unpack4x8unorm(entry.base_color_factor);
-        if ((debug.texture_flags & DebugTextureFlags_ALBEDO) != 0u) {
-            albedo = base_color_factor.xyz;
-        } else {
-            let base_color_sample = textureSampleLevel(textures[entry.base_color_texture], sampler_linear, tex_coords, lod);
-            albedo = (base_color_factor * base_color_sample).xyz;
-        }
-
-        if (debug.view_mode == DebugMode_HitConsistency) {
-            let reprojected = get_projected_pixel(camera, hit_position);
-            let barycentrics_pos_diff = (intersection.object_to_world * position_object).xyz - hit_position;
-            let camera_projection_diff = vec2<f32>(global_id.xy) - vec2<f32>(reprojected);
-            let consistency = vec4<f32>(length(barycentrics_pos_diff), length(camera_projection_diff), 0.0, 0.0);
-            textureStore(out_debug, global_id.xy, consistency);
-        }
-
-        let prev_position = (entry.prev_object_to_world * position_object).xyz;
-        let prev_screen = get_projected_pixel_float(prev_camera, prev_position);
-        //TODO: consider just storing integers here?
-        //TODO: technically this "0.5" is just a waste compute on both packing and unpacking
-        motion = prev_screen - vec2<f32>(global_id.xy) - 0.5;
-        if (debug.view_mode == DebugMode_Motion) {
-            textureStore(out_debug, global_id.xy, vec4<f32>(motion * MOTION_SCALE + vec2<f32>(0.5), 0.0, 1.0));
-        }
-    } else {
-        if (enable_debug) {
-            debug_buf.entry = DebugEntry();
-        }
-        if (debug.view_mode != DebugMode_Final) {
-            textureStore(out_debug, global_id.xy, vec4<f32>(0.0));
-        }
-    }
-
-    // TODO: option to avoid writing data for the sky
-    textureStore(out_depth, global_id.xy, vec4<f32>(depth, 0.0, 0.0, 0.0));
-    textureStore(out_basis, global_id.xy, basis);
-    textureStore(out_flat_normal, global_id.xy, vec4<f32>(flat_normal, 0.0));
-    textureStore(out_albedo, global_id.xy, vec4<f32>(albedo, 0.0));
-    textureStore(out_motion, global_id.xy, vec4<f32>(motion * MOTION_SCALE, 0.0, 0.0));
-}
diff --git a/blade-render/code/gbuf.inc.wgsl b/blade-render/code/gbuf.inc.wgsl
deleted file mode 100644
index ecb4642d..00000000
--- a/blade-render/code/gbuf.inc.wgsl
+++ /dev/null
@@ -1,2 +0,0 @@
-const MOTION_SCALE: f32 = 0.02;
-const USE_MOTION_VECTORS: bool = true;
\ No newline at end of file
diff --git a/blade-render/code/geometry.inc.wgsl b/blade-render/code/geometry.inc.wgsl
new file mode 100644
index 00000000..2ba8994c
--- /dev/null
+++ b/blade-render/code/geometry.inc.wgsl
@@ -0,0 +1,187 @@
+//TODO: https://github.com/gfx-rs/wgpu/pull/5429
+const RAY_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
+
+// Has to match the host!
+struct Vertex { // One element of a `VertexBuffer`.
+    pos: vec3<f32>, // transformed by `geometry_to_object` and then `object_to_world` in `fetch_geometry`
+    bitangent_sign: f32, // scales the bitangent built from cross(normal, tangent)
+    tex_coords: vec2<f32>,
+    normal: u32, // packed; decoded by `decode_normal` (unpack4x8snorm)
+    tangent: u32, // packed the same way as `normal`
+}
+struct VertexBuffer {
+    data: array<Vertex>,
+}
+struct IndexBuffer {
+    data: array<u32>,
+}
+var<storage, read> vertex_buffers: binding_array<VertexBuffer>;
+var<storage, read> index_buffers: binding_array<IndexBuffer>;
+var textures: binding_array<texture_2d<f32>>;
+
+struct HitEntry { // Per-geometry record; `fetch_geometry` indexes `hit_entries` by instance custom index + geometry index.
+    index_buf: u32, // index into `index_buffers`; `~0u` means the geometry is not indexed
+    vertex_buf: u32, // index into `vertex_buffers`
+    winding: f32, // multiplier applied to the computed flat normal
+    // packed quaternion, unpacked with `unpack4x8snorm` and normalized before use
+    geometry_to_world_rotation: u32,
+    geometry_to_object: mat4x3<f32>,
+    prev_object_to_world: mat4x3<f32>, // applied to the object-space hit before projecting it with `prev_camera`
+    base_color_texture: u32, // index into `textures`
+    // packed color factor, unpacked with `unpack4x8unorm`
+    base_color_factor: u32,
+    normal_texture: u32, // index into `textures`
+}
+var<storage, read> hit_entries: array<HitEntry>;
+
+fn decode_normal(raw: u32) -> vec3<f32> { // Unpack four 8-bit snorm components and keep XYZ.
+    return unpack4x8snorm(raw).xyz; // the fourth component is dropped; the result is not re-normalized here
+}
+
+fn debug_raw_normal(pos: vec3<f32>, normal_raw: u32, rotation: vec4<f32>, debug_len: f32, color: u32) { // Draw a packed normal as a debug line.
+    let nw = normalize(qrot(rotation, decode_normal(normal_raw))); // decoded normal rotated by the quaternion `rotation`
+    debug_line(pos, pos + debug_len * nw, color); // segment of length `debug_len` starting at `pos`
+}
+
+struct RichSurface { // Result of `fetch_geometry`.
+    inner: Surface, // basis, flat normal and depth of the hit (left at zero on a miss)
+    position: vec3<f32>, // camera.position + t * ray_dir
+    albedo: vec3<f32>, // base color factor, multiplied by the base color texture unless disabled; 1.0 on a miss
+    motion: vec2<f32>, // previous-frame screen position minus (pixel_coord + 0.5); not scaled by MOTION_SCALE
+}
+
+fn fetch_geometry(pixel_coord: vec2<i32>, is_primary: bool, enable_debug: bool) -> RichSurface { // Trace the camera ray through `pixel_coord` and describe the hit.
+    var rq: ray_query;
+    let ray_dir = get_ray_direction(camera, pixel_coord);
+    rayQueryInitialize(&rq, acc_struct, RayDesc(RAY_FLAG_CULL_NO_OPAQUE, 0xFFu, 0.0, camera.depth, camera.position, ray_dir));
+    rayQueryProceed(&rq);
+    let intersection = rayQueryGetCommittedIntersection(&rq);
+    // `is_primary` gates the debug entry and `out_debug` writes; `enable_debug` gates debug-buffer output.
+    var rs = RichSurface();
+    rs.albedo = vec3<f32>(1.0);
+    // On a miss, return the default surface: zero depth and white albedo.
+    if (intersection.kind == RAY_QUERY_INTERSECTION_NONE) {
+        if (enable_debug) {
+            debug_buf.entry = DebugEntry();
+        }
+        return rs;
+    }
+    // The hit record is addressed by instance custom index plus geometry index.
+    let entry = hit_entries[intersection.instance_custom_index + intersection.geometry_index];
+    // Non-indexed geometry (`index_buf == ~0u`) uses three consecutive vertices per primitive.
+    var indices = intersection.primitive_index * 3u + vec3<u32>(0u, 1u, 2u);
+    if (entry.index_buf != ~0u) {
+        let iptr = &index_buffers[entry.index_buf].data;
+        indices = vec3<u32>((*iptr)[indices.x], (*iptr)[indices.y], (*iptr)[indices.z]);
+    }
+
+    let vptr = &vertex_buffers[entry.vertex_buf].data;
+    let vertices = array<Vertex, 3>(
+        (*vptr)[indices.x],
+        (*vptr)[indices.y],
+        (*vptr)[indices.z],
+    );
+    // Triangle corners in object space, then in world space.
+    let positions_object = entry.geometry_to_object * mat3x4(
+        vec4<f32>(vertices[0].pos, 1.0), vec4<f32>(vertices[1].pos, 1.0), vec4<f32>(vertices[2].pos, 1.0)
+    );
+    let positions = intersection.object_to_world * mat3x4(
+        vec4<f32>(positions_object[0], 1.0), vec4<f32>(positions_object[1], 1.0), vec4<f32>(positions_object[2], 1.0)
+    );
+    let flat_normal = entry.winding * normalize(cross(positions[1].xyz - positions[0].xyz, positions[2].xyz - positions[0].xyz));
+    // Interpolate the vertex attributes at the hit using barycentric weights.
+    let barycentrics = vec3<f32>(1.0 - intersection.barycentrics.x - intersection.barycentrics.y, intersection.barycentrics);
+    let position_object = vec4<f32>(positions_object * barycentrics, 1.0);
+    let tex_coords = mat3x2(vertices[0].tex_coords, vertices[1].tex_coords, vertices[2].tex_coords) * barycentrics;
+    let normal_geo = normalize(mat3x3(decode_normal(vertices[0].normal), decode_normal(vertices[1].normal), decode_normal(vertices[2].normal)) * barycentrics);
+    let tangent_geo = normalize(mat3x3(decode_normal(vertices[0].tangent), decode_normal(vertices[1].tangent), decode_normal(vertices[2].tangent)) * barycentrics);
+    let bitangent_geo = normalize(cross(normal_geo, tangent_geo)) * vertices[0].bitangent_sign;
+
+    let lod = 0.0; //TODO: this is actually complicated
+
+    let geo_to_world_rot = normalize(unpack4x8snorm(entry.geometry_to_world_rotation));
+    let tangent_space_geo = mat3x3(tangent_geo, bitangent_geo, normal_geo);
+    var normal_local: vec3<f32>;
+    if ((debug.texture_flags & DebugTextureFlags_NORMAL) != 0u) {
+        normal_local = vec3<f32>(0.0, 0.0, 1.0); // ignore normal map
+    } else {
+        let n_xy = textureSampleLevel(textures[entry.normal_texture], sampler_linear, tex_coords, lod).xy;
+        normal_local = vec3<f32>(n_xy, sqrt(max(0.0, 1.0 - dot(n_xy.xy, n_xy.xy)))); // Z is reconstructed from XY
+    }
+    let normal = qrot(geo_to_world_rot, tangent_space_geo * normal_local);
+    let basis = shortest_arc_quat(vec3<f32>(0.0, 0.0, 1.0), normalize(normal));
+
+    let hit_position = camera.position + intersection.t * ray_dir;
+    if (enable_debug && is_primary) {
+        debug_buf.entry.custom_index = intersection.instance_custom_index;
+        debug_buf.entry.depth = intersection.t;
+        debug_buf.entry.tex_coords = tex_coords;
+        debug_buf.entry.base_color_texture = entry.base_color_texture;
+        debug_buf.entry.normal_texture = entry.normal_texture;
+        debug_buf.entry.position = hit_position;
+        debug_buf.entry.flat_normal = flat_normal;
+    }
+    if (enable_debug && (debug.draw_flags & DebugDrawFlags_SPACE) != 0u) {
+        let normal_len = 0.15 * intersection.t;
+        let side = 0.05 * intersection.t;
+        debug_line(hit_position, hit_position + normal_len * qrot(geo_to_world_rot, normal_geo), 0xFFFFFFu);
+        debug_line(hit_position - side * tangent_geo, hit_position + side * tangent_geo, 0x808080u);
+        debug_line(hit_position - side * bitangent_geo, hit_position + side * bitangent_geo, 0x808080u);
+    }
+    if (enable_debug && (debug.draw_flags & DebugDrawFlags_GEOMETRY) != 0u) {
+        let debug_len = intersection.t * 0.2;
+        debug_line(positions[0].xyz, positions[1].xyz, 0x00FFFFu);
+        debug_line(positions[1].xyz, positions[2].xyz, 0x00FFFFu);
+        debug_line(positions[2].xyz, positions[0].xyz, 0x00FFFFu);
+        let poly_center = (positions[0].xyz + positions[1].xyz + positions[2].xyz) / 3.0;
+        debug_line(poly_center, poly_center + 0.2 * debug_len * flat_normal, 0xFF00FFu);
+        // note: dynamic indexing into positions isn't allowed by WGSL yet
+        debug_raw_normal(positions[0].xyz, vertices[0].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
+        debug_raw_normal(positions[1].xyz, vertices[1].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
+        debug_raw_normal(positions[2].xyz, vertices[2].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
+        // draw tangent space
+        debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(1.0, 0.0, 0.0)), 0x0000FFu);
+        debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 1.0, 0.0)), 0x00FF00u);
+        debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 0.0, 1.0)), 0xFF0000u);
+    }
+
+    rs.albedo = unpack4x8unorm(entry.base_color_factor).xyz;
+    if ((debug.texture_flags & DebugTextureFlags_ALBEDO) == 0u) {
+        let base_color_sample = textureSampleLevel(textures[entry.base_color_texture], sampler_linear, tex_coords, lod);
+        rs.albedo *= base_color_sample.xyz;
+    }
+    // Reproject the hit into the previous frame to obtain screen-space motion.
+    let prev_position = (entry.prev_object_to_world * position_object).xyz;
+    let prev_screen = get_projected_pixel_float(prev_camera, prev_position);
+    //TODO: consider just storing integers here?
+    //TODO: technically this "0.5" is just a waste of compute on both packing and unpacking
+    rs.motion = prev_screen - vec2<f32>(pixel_coord) - 0.5;
+    rs.position = hit_position;
+    // Debug views are only written for primary rays.
+    if (is_primary) {
+        if (debug.view_mode == DebugMode_Depth) {
+            let value = 1.0 / intersection.t;
+            textureStore(out_debug, pixel_coord, vec4<f32>(value));
+        }
+        if (debug.view_mode == DebugMode_Normal) {
+            textureStore(out_debug, pixel_coord, vec4<f32>(normal, 0.0));
+        }
+        if (debug.view_mode == DebugMode_HitConsistency) {
+            let reprojected = get_projected_pixel(camera, hit_position);
+            let barycentrics_pos_diff = (intersection.object_to_world * position_object).xyz - hit_position;
+            let camera_projection_diff = vec2<f32>(pixel_coord - reprojected);
+            let consistency = vec4<f32>(length(barycentrics_pos_diff), length(camera_projection_diff), 0.0, 0.0);
+            textureStore(out_debug, pixel_coord, consistency);
+        }
+        if (debug.view_mode == DebugMode_Motion) {
+            let motion = rs.motion * MOTION_SCALE + vec2<f32>(0.5); // bias: `out_debug` is unorm, negative motion would clamp to 0
+            textureStore(out_debug, pixel_coord, vec4<f32>(motion, 0.0, 0.0));
+        }
+    }
+
+    // Write down the Surface
+    rs.inner.basis = basis;
+    rs.inner.flat_normal = flat_normal;
+    rs.inner.depth = intersection.t;
+    return rs;
+}
diff --git a/blade-render/code/motion.inc.wgsl b/blade-render/code/motion.inc.wgsl
new file mode 100644
index 00000000..3e721716
--- /dev/null
+++ b/blade-render/code/motion.inc.wgsl
@@ -0,0 +1 @@
+const MOTION_SCALE: f32 = 0.02;
diff --git a/blade-render/code/random.inc.wgsl b/blade-render/code/random.inc.wgsl
index 3f68478c..1bce9317 100644
--- a/blade-render/code/random.inc.wgsl
+++ b/blade-render/code/random.inc.wgsl
@@ -28,7 +28,7 @@ fn rot32(x: u32, bits: u32) -> u32 {
 }
 
 // https://en.wikipedia.org/wiki/MurmurHash
-fn murmur3(rng: ptr<function, RandomState>) -> u32 {
+fn murmur3(rng: ptr<private, RandomState>) -> u32 {
     let c1 = 0xcc9e2d51u;
     let c2 = 0x1b873593u;
     let r1 = 15u;
@@ -56,7 +56,11 @@ fn murmur3(rng: ptr<function, RandomState>) -> u32 {
     return hash;
 }
 
-fn random_gen(rng: ptr<function, RandomState>) -> f32 {
+fn random_u32(rng: ptr<private, RandomState>) -> u32 { // Raw 32-bit random value.
+    return murmur3(rng); // forwards to `murmur3` on the same state pointer
+}
+
+fn random_gen(rng: ptr<private, RandomState>) -> f32 {
     let v = murmur3(rng);
     let one = bitcast<u32>(1.0);
     let mask = (1u << 23u) - 1u;
diff --git a/blade-render/code/ray-trace.wgsl b/blade-render/code/ray-trace.wgsl
index b27b1065..93cacd15 100644
--- a/blade-render/code/ray-trace.wgsl
+++ b/blade-render/code/ray-trace.wgsl
@@ -1,3 +1,4 @@
+#include "color.inc.wgsl"
 #include "quaternion.inc.wgsl"
 #include "random.inc.wgsl"
 #include "env-importance.inc.wgsl"
@@ -5,36 +6,45 @@
 #include "debug-param.inc.wgsl"
 #include "camera.inc.wgsl"
 #include "surface.inc.wgsl"
-#include "gbuf.inc.wgsl"
-
-//TODO: use proper WGSL
-const RAY_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
+#include "geometry.inc.wgsl"
+#include "motion.inc.wgsl"
+#include "accum.inc.wgsl"
 
 const PI: f32 = 3.1415926;
-const MAX_RESERVOIRS: u32 = 2u;
-// See "9.1 pairwise mis for robust reservoir reuse"
-// "Correlations and Reuse for Fast and Accurate Physically Based Light Transport"
-const PAIRWISE_MIS: bool = true;
-// Base MIS for canonical samples. The constant isolates a critical difference between
-// Bitterli's pseudocode (where it's 1) and NVidia's RTXDI implementation (where it's 0).
-// With Bitterli's 1 we have MIS not respecting the prior history enough.
-const BASE_CANONICAL_MIS: f32 = 0.05;
+const MAX_RESAMPLE: u32 = 4u;
+
+const DRAW_DEBUG: bool = false;
 // See "DECOUPLING SHADING AND REUSE" in
 // "Rearchitecting Spatiotemporal Resampling for Production"
 const DECOUPLED_SHADING: bool = false;
+const WRITE_DEBUG_IMAGE: bool = false;
+//TODO: currently unused
+const WRITE_MOTION_VECTORS: bool = false;
+
+//TODO: crashes on AMD 6850U if `GROUP_SIZE_TOTAL` > 32
+const GROUP_SIZE: vec2<u32> = vec2<u32>(8, 4);
+const GROUP_SIZE_TOTAL: u32 = GROUP_SIZE.x * GROUP_SIZE.y;
+
+var<private> p_debug_len: f32;
+var<private> p_rng: RandomState;
 
 struct MainParams {
     frame_index: u32,
     num_environment_samples: u32,
     environment_importance_sampling: u32,
     temporal_tap: u32,
-    temporal_history: u32,
+    temporal_tap_confidence: f32,
     spatial_taps: u32,
-    spatial_tap_history: u32,
-    spatial_radius: i32,
+    spatial_tap_confidence: f32,
+    spatial_min_distance: i32,
     t_start: f32,
+    use_pairwise_mis: u32,
+    defensive_mis: f32,
     use_motion_vectors: u32,
-};
+    temporal_accumulation_weight: f32,
+    pad: f32,
+    grid_scale: vec2<u32>,
+}
 
 var<uniform> camera: CameraParams;
 var<uniform> prev_camera: CameraParams;
@@ -54,7 +64,14 @@ struct StoredReservoir {
     confidence: f32,
 }
 var<storage, read_write> reservoirs: array<StoredReservoir>;
-var<storage, read> prev_reservoirs: array<StoredReservoir>;
+
+struct PixelCache { // Element type of the `pixel_cache` workgroup array; also the neighbor argument of `resample`.
+    surface: Surface, // surface used when scoring the canonical sample at this pixel
+    reservoir: StoredReservoir, // packed reservoir offered for reuse
+    //Note: we could store direction XY in local camera space instead
+    world_pos: vec3<f32>, // ray origin for the occlusion test at this pixel
+}
+var<workgroup> pixel_cache: array<PixelCache, GROUP_SIZE_TOTAL>;
 
 struct LightSample {
     radiance: vec3<f32>,
@@ -83,12 +100,6 @@ fn get_reservoir_index(pixel: vec2<i32>, camera: CameraParams) -> i32 {
     }
 }
 
-fn get_pixel_from_reservoir_index(index: i32, camera: CameraParams) -> vec2<i32> {
-    let y = index / i32(camera.target_size.x);
-    let x = index - y * i32(camera.target_size.x);
-    return vec2<i32>(x, y);
-}
-
 fn bump_reservoir(r: ptr<function, LiveReservoir>, history: f32) {
     (*r).history += history;
 }
@@ -102,9 +113,11 @@ fn make_reservoir(ls: LightSample, light_index: u32, brdf: vec3<f32>) -> LiveRes
     r.history = 1.0;
     return r;
 }
-fn merge_reservoir(r: ptr<function, LiveReservoir>, other: LiveReservoir, random: f32) -> bool {
+
+fn merge_reservoir(r: ptr<function, LiveReservoir>, other: LiveReservoir) -> bool {
     (*r).weight_sum += other.weight_sum;
     (*r).history += other.history;
+    let random = random_gen(&p_rng);
     if ((*r).weight_sum * random < other.weight_sum) {
         (*r).selected_light_index = other.selected_light_index;
         (*r).selected_uv = other.selected_uv;
@@ -115,24 +128,24 @@ fn merge_reservoir(r: ptr<function, LiveReservoir>, other: LiveReservoir, random
         return false;
     }
 }
-fn unpack_reservoir(f: StoredReservoir, max_history: u32) -> LiveReservoir {
+fn unpack_reservoir(f: StoredReservoir, max_confidence: f32, radiance: vec3<f32>) -> LiveReservoir {
     var r: LiveReservoir;
     r.selected_light_index = f.light_index;
     r.selected_uv = f.light_uv;
     r.selected_target_score = f.target_score;
-    r.radiance = vec3<f32>(0.0); // to be continued...
-    let history = min(f.confidence, f32(max_history));
+    r.radiance = radiance;
+    let history = min(f.confidence, max_confidence);
     r.weight_sum = f.contribution_weight * f.target_score * history;
     r.history = history;
     return r;
 }
-fn pack_reservoir_detail(r: LiveReservoir, denom_factor: f32) -> StoredReservoir {
+fn pack_reservoir_detail(r: LiveReservoir, effective_history: f32) -> StoredReservoir {
     var f: StoredReservoir;
     f.light_index = r.selected_light_index;
     f.light_uv = r.selected_uv;
     f.target_score = r.selected_target_score;
     f.confidence = r.history;
-    let denom = f.target_score * denom_factor;
+    let denom = f.target_score * effective_history;
     f.contribution_weight = select(0.0, r.weight_sum / denom, denom > 0.0);
     return f;
 }
@@ -140,14 +153,11 @@ fn pack_reservoir(r: LiveReservoir) -> StoredReservoir {
     return pack_reservoir_detail(r, r.history);
 }
 
-var t_depth: texture_2d<f32>;
-var t_prev_depth: texture_2d<f32>;
-var t_basis: texture_2d<f32>;
-var t_prev_basis: texture_2d<f32>;
-var t_flat_normal: texture_2d<f32>;
-var t_prev_flat_normal: texture_2d<f32>;
-var t_motion: texture_2d<f32>;
-var out_diffuse: texture_storage_2d<rgba16float, write>;
+var inout_depth: texture_storage_2d<r32float, read_write>;
+var inout_basis: texture_storage_2d<rgba8snorm, read_write>;
+var inout_flat_normal: texture_storage_2d<rgba8snorm, read_write>;
+var out_albedo: texture_storage_2d<rgba8unorm, write>;
+var out_motion: texture_storage_2d<rg8snorm, write>;
 var out_debug: texture_storage_2d<rgba8unorm, write>;
 
 fn sample_circle(random: f32) -> vec2<f32> {
@@ -176,9 +186,9 @@ fn evaluate_environment(dir: vec3<f32>) -> vec3<f32> {
     return textureSampleLevel(env_map, sampler_linear, uv, 0.0).xyz;
 }
 
-fn sample_light_from_sphere(rng: ptr<function, RandomState>) -> LightSample {
-    let a = random_gen(rng);
-    let h = 1.0 - 2.0 * random_gen(rng); // make sure to allow h==1
+fn sample_light_from_sphere() -> LightSample {
+    let a = random_gen(&p_rng);
+    let h = 1.0 - 2.0 * random_gen(&p_rng); // make sure to allow h==1
     let tangential = sqrt(1.0 - square(h)) * sample_circle(a);
     let dir = vec3<f32>(tangential.x, h, tangential.y);
     var ls = LightSample();
@@ -188,33 +198,33 @@ fn sample_light_from_sphere(rng: ptr<function, RandomState>) -> LightSample {
     return ls;
 }
 
-fn sample_light_from_environment(rng: ptr<function, RandomState>) -> LightSample {
+fn sample_light_from_environment() -> LightSample {
     let dim = textureDimensions(env_map, 0);
-    let es = generate_environment_sample(rng, dim);
+    let es = generate_environment_sample(&p_rng, dim);
     var ls = LightSample();
     ls.pdf = es.pdf;
     // sample the incoming radiance
     ls.radiance = textureLoad(env_map, es.pixel, 0).xyz;
     // for determining direction - offset randomly within the texel
     // Note: this only works if the texels are sufficiently small
-    ls.uv = (vec2<f32>(es.pixel) + vec2<f32>(random_gen(rng), random_gen(rng))) / vec2<f32>(dim);
+    ls.uv = (vec2<f32>(es.pixel) + vec2<f32>(random_gen(&p_rng), random_gen(&p_rng))) / vec2<f32>(dim);
     return ls;
 }
 
-fn read_surface(pixel: vec2<i32>) -> Surface {
+fn read_prev_surface(pixel: vec2<i32>) -> Surface {
     var surface: Surface;
-    surface.basis = normalize(textureLoad(t_basis, pixel, 0));
-    surface.flat_normal = normalize(textureLoad(t_flat_normal, pixel, 0).xyz);
-    surface.depth = textureLoad(t_depth, pixel, 0).x;
+    surface.basis = normalize(textureLoad(inout_basis, pixel));
+    surface.flat_normal = normalize(textureLoad(inout_flat_normal, pixel).xyz);
+    surface.depth = textureLoad(inout_depth, pixel).x;
     return surface;
 }
 
-fn read_prev_surface(pixel: vec2<i32>) -> Surface {
-    var surface: Surface;
-    surface.basis = normalize(textureLoad(t_prev_basis, pixel, 0));
-    surface.flat_normal = normalize(textureLoad(t_prev_flat_normal, pixel, 0).xyz);
-    surface.depth = textureLoad(t_prev_depth, pixel, 0).x;
-    return surface;
+fn thread_index_to_coord(thread_index: u32, group_id: vec3<u32>) -> vec2<i32> { // Map a thread of a workgroup to the pixel it processes.
+    let cluster_id = group_id.xy / parameters.grid_scale; // which block of `grid_scale` groups (per axis) this group is in
+    let cluster_offset = group_id.xy - cluster_id * parameters.grid_scale; // equals group_id.xy % grid_scale
+    let local_id = vec2<u32>(thread_index % GROUP_SIZE.x, thread_index / GROUP_SIZE.x); // row-major position inside the group
+    let global_id = (cluster_id * GROUP_SIZE + local_id) * parameters.grid_scale + cluster_offset; // adjacent threads land `grid_scale` pixels apart
+    return vec2<i32>(global_id);
 }
 
 fn evaluate_brdf(surface: Surface, dir: vec3<f32>) -> f32 {
@@ -224,7 +234,7 @@ fn evaluate_brdf(surface: Surface, dir: vec3<f32>) -> f32 {
     return lambert_brdf * max(0.0, lambert_term);
 }
 
-fn check_ray_occluded(acs: acceleration_structure, position: vec3<f32>, direction: vec3<f32>, debug_len: f32) -> bool {
+fn check_ray_occluded(acs: acceleration_structure, position: vec3<f32>, direction: vec3<f32>) -> bool {
     var rq: ray_query;
     let flags = RAY_FLAG_TERMINATE_ON_FIRST_HIT | RAY_FLAG_CULL_NO_OPAQUE;
     rayQueryInitialize(&rq, acs,
@@ -234,9 +244,9 @@ fn check_ray_occluded(acs: acceleration_structure, position: vec3<f32>, directio
     let intersection = rayQueryGetCommittedIntersection(&rq);
 
     let occluded = intersection.kind != RAY_QUERY_INTERSECTION_NONE;
-    if (debug_len != 0.0) {
+    if (DRAW_DEBUG && p_debug_len != 0.0) {
         let color = select(0xFFFFFFu, 0x0000FFu, occluded);
-        debug_line(position, position + debug_len * direction, color);
+        debug_line(position, position + p_debug_len * direction, color);
     }
     return occluded;
 }
@@ -255,15 +265,6 @@ fn evaluate_reflected_light(surface: Surface, light_index: u32, light_uv: vec2<f
     return radiance * brdf;
 }
 
-fn get_prev_pixel(pixel: vec2<i32>, pos_world: vec3<f32>) -> vec2<f32> {
-    if (USE_MOTION_VECTORS && parameters.use_motion_vectors != 0u) {
-        let motion = textureLoad(t_motion, pixel, 0).xy / MOTION_SCALE;
-        return vec2<f32>(pixel) + 0.5 + motion;
-    } else {
-        return get_projected_pixel_float(prev_camera, pos_world);
-    }
-}
-
 struct TargetScore {
     color: vec3<f32>,
     score: f32,
@@ -274,7 +275,7 @@ fn make_target_score(color: vec3<f32>) -> TargetScore {
 }
 
 fn estimate_target_score_with_occlusion(
-    surface: Surface, position: vec3<f32>, light_index: u32, light_uv: vec2<f32>, acs: acceleration_structure, debug_len: f32
+    surface: Surface, position: vec3<f32>, light_index: u32, light_uv: vec2<f32>, acs: acceleration_structure,
 ) -> TargetScore {
     if (light_index != 0u) {
         return TargetScore();
@@ -288,16 +289,16 @@ fn estimate_target_score_with_occlusion(
         return TargetScore();
     }
 
-    if (check_ray_occluded(acs, position, direction, debug_len)) {
+    if (check_ray_occluded(acs, position, direction)) {
         return TargetScore();
-    } else {
-        //Note: same as `evaluate_reflected_light`
-        let radiance = textureSampleLevel(env_map, sampler_nearest, light_uv, 0.0).xyz;
-        return make_target_score(brdf * radiance);
     }
+
+    //Note: same as `evaluate_reflected_light`
+    let radiance = textureSampleLevel(env_map, sampler_nearest, light_uv, 0.0).xyz;
+    return make_target_score(brdf * radiance);
 }
 
-fn evaluate_sample(ls: LightSample, surface: Surface, start_pos: vec3<f32>, debug_len: f32) -> f32 {
+fn evaluate_sample(ls: LightSample, surface: Surface, start_pos: vec3<f32>) -> f32 {
     let dir = map_equirect_uv_to_dir(ls.uv);
     if (dot(dir, surface.flat_normal) <= 0.0) {
         return 0.0;
@@ -313,211 +314,352 @@ fn evaluate_sample(ls: LightSample, surface: Surface, start_pos: vec3<f32>, debu
         return 0.0;
     }
 
-    if (check_ray_occluded(acc_struct, start_pos, dir, debug_len)) {
+    if (check_ray_occluded(acc_struct, start_pos, dir)) {
         return 0.0;
     }
 
     return brdf;
 }
 
-struct HeuristicFactors {
-    weight: f32,
-    //history: f32,
-}
-
-fn balance_heuristic(w0: f32, w1: f32, h0: f32, h1: f32) -> HeuristicFactors {
-    var hf: HeuristicFactors;
-    let balance_denom = h0 * w0 + h1 * w1;
-    hf.weight = select(h0 * w0 / balance_denom, 0.0, balance_denom <= 0.0);
-    //hf.history = select(pow(clamp(w1 / w0, 0.0, 1.0), 8.0), 1.0, w0 <= 0.0);
-    return hf;
-}
-
-struct RestirOutput {
-    radiance: vec3<f32>,
-}
-
-fn compute_restir(surface: Surface, pixel: vec2<i32>, rng: ptr<function, RandomState>, enable_debug: bool) -> RestirOutput {
-    if (debug.view_mode == DebugMode_Depth) {
-        textureStore(out_debug, pixel, vec4<f32>(surface.depth / camera.depth));
-    }
-    let ray_dir = get_ray_direction(camera, pixel);
-    let pixel_index = get_reservoir_index(pixel, camera);
-    if (surface.depth == 0.0) {
-        reservoirs[pixel_index] = StoredReservoir();
-        let env = evaluate_environment(ray_dir);
-        return RestirOutput(env);
-    }
-
-    let debug_len = select(0.0, surface.depth * 0.2, enable_debug);
-    let position = camera.position + surface.depth * ray_dir;
-    let normal = qrot(surface.basis, vec3<f32>(0.0, 0.0, 1.0));
-    if (debug.view_mode == DebugMode_Normal) {
-        textureStore(out_debug, pixel, vec4<f32>(normal, 0.0));
-    }
-
-    var canonical = LiveReservoir();
+fn produce_canonical(
+    surface: Surface, position: vec3<f32>, // surface being lit; origin of the occlusion rays
+) -> LiveReservoir { // Build a reservoir from `num_environment_samples` fresh light samples.
+    var reservoir = LiveReservoir();
     for (var i = 0u; i < parameters.num_environment_samples; i += 1u) {
         var ls: LightSample;
         if (parameters.environment_importance_sampling != 0u) {
-            ls = sample_light_from_environment(rng);
+            ls = sample_light_from_environment();
         } else {
-            ls = sample_light_from_sphere(rng);
+            ls = sample_light_from_sphere();
         }
 
-        let brdf = evaluate_sample(ls, surface, position, debug_len);
+        let brdf = evaluate_sample(ls, surface, position);
         if (brdf > 0.0) {
             let other = make_reservoir(ls, 0u, vec3<f32>(brdf));
-            merge_reservoir(&canonical, other, random_gen(rng));
+            merge_reservoir(&reservoir, other); // randomness now comes from the private `p_rng` inside `merge_reservoir`
         } else {
-            bump_reservoir(&canonical, 1.0);
+            bump_reservoir(&reservoir, 1.0); // a rejected sample still adds 1.0 to the history
         }
     }
+    return reservoir;
+}
 
-    //TODO: find best match in a 2x2 grid
-    let prev_pixel = vec2<i32>(get_prev_pixel(pixel, position));
+struct TemporalReprojection { // Result of `find_temporal`.
+    is_valid: bool, // true once a candidate pixel passed all checks
+    pixel: vec2<i32>, // last candidate examined; the accepted one when `is_valid`
+    surface: Surface, // read via `read_prev_surface(pixel)`
+    reservoir: StoredReservoir, // loaded from `reservoirs` for `pixel`
+}
 
-    // First, gather the list of reservoirs to merge with
-    var accepted_reservoir_indices = array<i32, MAX_RESERVOIRS>();
-    var accepted_count = 0u;
-    var temporal_index = ~0u;
-    for (var tap = 0u; tap <= parameters.spatial_taps; tap += 1u) {
-        var other_pixel = prev_pixel;
-        if (tap != 0u) {
-            let r0 = max(prev_pixel - vec2<i32>(parameters.spatial_radius), vec2<i32>(0));
-            let r1 = min(prev_pixel + vec2<i32>(parameters.spatial_radius + 1), vec2<i32>(prev_camera.target_size));
-            other_pixel = vec2<i32>(mix(vec2<f32>(r0), vec2<f32>(r1), vec2<f32>(random_gen(rng), random_gen(rng))));
-        } else if (parameters.temporal_tap == 0u)
-        {
-            continue;
-        }
+fn find_temporal(surface: Surface, pixel: vec2<i32>, center_coord: vec2<f32>) -> TemporalReprojection { // Look around `center_coord` for a stored surface compatible with `surface`.
+    var tr = TemporalReprojection();
+    tr.is_valid = false;
+    if (surface.depth == 0.0) { // zero depth: no geometry was hit, nothing to match
+        return tr;
+    }
+
+    // Find the first acceptable match in a 2x2 grid
+    let center_pixel = vec2<i32>(center_coord);
+    // Trick to start with closer pixels: per axis, `center_sum - center_pixel` is the neighbor on the side the sub-pixel offset leans to
+    let center_sum = vec2<i32>(center_coord - 0.5) + vec2<i32>(center_coord + 0.5);
+    var prev_pixels = array<vec2<i32>, 4>(
+        center_pixel.xy,
+        vec2<i32>(center_sum.x - center_pixel.x, center_pixel.y),
+        center_sum - center_pixel,
+        vec2<i32>(center_pixel.x, center_sum.y - center_pixel.y),
+    );
 
-        let other_index = get_reservoir_index(other_pixel, prev_camera);
-        if (other_index < 0) {
+    for (var i = 0; i < 4 && !tr.is_valid; i += 1) { // stops at the first accepted candidate
+        tr.pixel = prev_pixels[i];
+        let prev_reservoir_index = get_reservoir_index(tr.pixel, prev_camera);
+        if (prev_reservoir_index < 0) { // negative index: presumably the pixel is outside the target — TODO confirm
             continue;
         }
-        if (prev_reservoirs[other_index].confidence == 0.0) {
+        tr.reservoir = reservoirs[prev_reservoir_index]; // history is read from `reservoirs` itself; `prev_reservoirs` is gone
+        if (tr.reservoir.confidence == 0.0) {
             continue;
         }
-
-        let other_surface = read_prev_surface(other_pixel);
-        let compatibility = compare_surfaces(surface, other_surface);
-        if (compatibility < 0.1) {
-            // if the surfaces are too different, there is no trust in this sample
+        tr.surface = read_prev_surface(tr.pixel);
+        if (compare_surfaces(surface, tr.surface) < 0.1) { // surfaces too different: no trust in this sample
             continue;
         }
-
-        if (tap == 0u) {
-            temporal_index = accepted_count;
+        tr.is_valid = true;
+
+        if (WRITE_DEBUG_IMAGE && debug.view_mode == DebugMode_Reprojection) {
+            var colors = array<vec3<f32>, 4>( // one color per candidate slot, in `prev_pixels` order
+                vec3<f32>(1.0, 1.0, 1.0),
+                vec3<f32>(1.0, 0.0, 0.0),
+                vec3<f32>(0.0, 1.0, 0.0),
+                vec3<f32>(0.0, 0.0, 1.0),
+            );
+            textureStore(out_debug, pixel, vec4<f32>(colors[i], 1.0));
         }
-        accepted_reservoir_indices[accepted_count] = other_index;
-        if (accepted_count < MAX_RESERVOIRS) {
-            accepted_count += 1u;
+    }
+    return tr;
+}
+
+struct ResampleBase { // Data of the pixel that receives resampled neighbors.
+    surface: Surface,
+    canonical: LiveReservoir, // the pixel's own reservoir before any neighbor is merged
+    world_pos: vec3<f32>,
+    accepted_count: f32, // divisor of the canonical terms in `resample`; presumably the number of neighbors merged — TODO confirm
+}
+struct ResampleResult { // Returned by `resample`.
+    selected: bool, // set when the neighbor had positive weight and was passed to `merge_reservoir`
+    mis_canonical: f32, // MIS weight computed for the canonical sample (1.0 without pairwise MIS)
+    mis_sample: f32, // MIS weight applied to the neighbor sample (1.0 without pairwise MIS)
+}
+
+fn ratio(a: f32, b: f32) -> f32 { // a / (a + b), or 0.0 when the sum is not positive.
+    return select(0.0, a / (a+b), a+b > 0.0); // `select` picks the first value when the condition is false
+}
+
+// Resample following Algorithm 8 in section 9.1 of Bitterli thesis: merge the neighbor `other` into `dst`.
+fn resample(
+    dst: ptr<function, LiveReservoir>, color_and_weight: ptr<function, vec4<f32>>, // reservoir being built; running sum used only with DECOUPLED_SHADING
+    base: ResampleBase, other: PixelCache, other_acs: acceleration_structure, // receiving pixel; neighbor data; acceleration structure for occlusion tests at the neighbor
+    max_confidence: f32, // upper bound applied to the neighbor's stored confidence
+) -> ResampleResult {
+    var src: LiveReservoir;
+    let neighbor = other.reservoir;
+    var rr = ResampleResult();
+    if (parameters.use_pairwise_mis != 0u) {
+        let canonical = base.canonical;
+        let neighbor_history = min(neighbor.confidence, max_confidence);
+        let mis_scale = 1.0 / (base.accepted_count + parameters.defensive_mis);
+        {   // scoping this to hint the register allocation
+            let t_canonical_at_neighbor = estimate_target_score_with_occlusion(
+                other.surface, other.world_pos, canonical.selected_light_index, canonical.selected_uv, other_acs);
+            let r_canonical = ratio(canonical.history * canonical.selected_target_score / base.accepted_count, neighbor_history * t_canonical_at_neighbor.score); // NOTE(review): assumes base.accepted_count > 0 — confirm in callers
+            rr.mis_canonical = mis_scale * (parameters.defensive_mis / base.accepted_count + r_canonical);
         }
+
+        let t_neighbor_at_canonical = estimate_target_score_with_occlusion(
+            base.surface, base.world_pos, neighbor.light_index, neighbor.light_uv, acc_struct);
+        let r_neighbor = ratio(neighbor_history * neighbor.target_score, canonical.history * t_neighbor_at_canonical.score / base.accepted_count);
+        rr.mis_sample = mis_scale * r_neighbor;
+
+        src.history = neighbor_history;
+        src.selected_light_index = neighbor.light_index;
+        src.selected_uv = neighbor.light_uv;
+        src.selected_target_score = t_neighbor_at_canonical.score; // the neighbor's sample re-scored at the receiving pixel
+        src.weight_sum = t_neighbor_at_canonical.score * neighbor.contribution_weight * rr.mis_sample;
+        src.radiance = t_neighbor_at_canonical.color;
+    } else {
+        rr.mis_canonical = 1.0; // without pairwise MIS both weights are 1 and the neighbor is unpacked as stored
+        rr.mis_sample = 1.0;
+        let radiance = evaluate_reflected_light(base.surface, neighbor.light_index, neighbor.light_uv);
+        src = unpack_reservoir(neighbor, max_confidence, radiance);
+    }
+
+    if (DECOUPLED_SHADING) {
+        *color_and_weight += src.weight_sum * vec4<f32>(neighbor.contribution_weight * src.radiance, 1.0); // xyz: weighted color, w: total weight
+    }
+    if (src.weight_sum <= 0.0) {
+        bump_reservoir(dst, src.history); // useless sample: only its history is added
+    } else {
+        merge_reservoir(dst, src);
+        rr.selected = true; // NOTE(review): set even if `merge_reservoir` returns false — confirm this is intended
+    }
+    return rr;
+}
+
+struct ResampleOutput { // Returned by `finalize_canonical` and `finalize_resampling`.
+    reservoir: StoredReservoir, // packed reservoir
+    color: vec3<f32>, // contribution_weight * radiance, or the weighted average when DECOUPLED_SHADING
+}
+
+fn revive_canonical(ro: ResampleOutput) -> LiveReservoir { // Rebuild a live reservoir from a packed output.
+    let radiance = select(vec3<f32>(0.0), ro.color / ro.reservoir.contribution_weight, ro.reservoir.contribution_weight > 0.0); // inverts `color = contribution_weight * radiance`; zero if the weight is not positive
+    return unpack_reservoir(ro.reservoir, 100.0, radiance); // NOTE(review): 100.0 is the confidence cap passed here — confirm it is meant to be effectively unbounded
+}
+
+fn finalize_canonical(reservoir: LiveReservoir) -> ResampleOutput { // Pack a reservoir as-is, without merging any neighbor.
+    var ro = ResampleOutput();
+    ro.reservoir = pack_reservoir(reservoir);
+    ro.color = ro.reservoir.contribution_weight * reservoir.radiance; // color of the selected sample times its packed weight
+    return ro;
+}
+
+fn finalize_resampling(
+    reservoir: ptr<function, LiveReservoir>, color_and_weight: ptr<function, vec4<f32>>, // neighbors merged so far; running sum used only with DECOUPLED_SHADING
+    base: ResampleBase, mis_canonical: f32, // receiving pixel's data; MIS weight for its canonical sample
+) -> ResampleOutput { // Merge the canonical reservoir last, then pack the result and compute its color.
+    var canonical = base.canonical;
+    if (parameters.use_pairwise_mis != 0u) {
+        canonical.weight_sum *= select(0.0, mis_canonical / canonical.history, canonical.history > 0.0); // an empty canonical (zero history) contributes 0 instead of 0 * (x / 0)
+    }
+    merge_reservoir(reservoir, canonical);
+
+    let effective_history = select((*reservoir).history, 1.0, parameters.use_pairwise_mis != 0u); // 1.0 with pairwise MIS, the accumulated history otherwise
+    var ro = ResampleOutput();
+    ro.reservoir = pack_reservoir_detail(*reservoir, effective_history);
+
+    if (DECOUPLED_SHADING) {
+        if (canonical.selected_target_score > 0.0) {
+            let contribution_weight = canonical.weight_sum / canonical.selected_target_score;
+            *color_and_weight += canonical.weight_sum * vec4<f32>(contribution_weight * canonical.radiance, 1.0);
+        }
+        ro.color = (*color_and_weight).xyz / max((*color_and_weight).w, 0.001); // weighted average; the floor avoids dividing by ~0
+    } else {
+        ro.color = ro.reservoir.contribution_weight * (*reservoir).radiance;
+    }
+    return ro;
+}
+
+fn resample_temporal( // temporal pass: combine a fresh canonical sample with the reprojected previous-frame reservoir
+    surface: Surface, cur_pixel: vec2<i32>, position: vec3<f32>,
+    local_index: u32, tr: TemporalReprojection, // NOTE(review): `local_index` is not read anywhere in this function
+) -> ResampleOutput {
+    if (surface.depth == 0.0) { // zero depth means no surface here (resample_spatial() shades the environment for it); return a default output
+        return ResampleOutput();
+    }
+
+    let canonical = produce_canonical(surface, position);
+    if (parameters.temporal_tap == 0u || !tr.is_valid) { // temporal reuse disabled, or no valid reprojection: keep the canonical sample alone
+        return finalize_canonical(canonical);
     }
 
-    // Next, evaluate the MIS of each of the samples versus the canonical one.
     var reservoir = LiveReservoir();
-    var shaded_color = vec3<f32>(0.0);
-    var mis_canonical = BASE_CANONICAL_MIS;
     var color_and_weight = vec4<f32>(0.0);
-    for (var rid = 0u; rid < accepted_count; rid += 1u) {
-        let neighbor_index = accepted_reservoir_indices[rid];
-        let neighbor = prev_reservoirs[neighbor_index];
-
-        let max_history = select(parameters.spatial_tap_history, parameters.temporal_history, rid == temporal_index);
-        var other: LiveReservoir;
-        if (PAIRWISE_MIS) {
-            let neighbor_pixel = get_pixel_from_reservoir_index(neighbor_index, prev_camera);
-            let neighbor_history = min(neighbor.confidence, f32(max_history));
-            {   // scoping this to hint the register allocation
-                let neighbor_surface = read_prev_surface(neighbor_pixel);
-                let neighbor_dir = get_ray_direction(prev_camera, neighbor_pixel);
-                let neighbor_position = prev_camera.position + neighbor_surface.depth * neighbor_dir;
-
-                let t_canonical_at_neighbor = estimate_target_score_with_occlusion(
-                    neighbor_surface, neighbor_position, canonical.selected_light_index, canonical.selected_uv, prev_acc_struct, debug_len);
-                let mis_sub_canonical = balance_heuristic(
-                    t_canonical_at_neighbor.score, canonical.selected_target_score,
-                    neighbor_history * f32(accepted_count), canonical.history);
-                mis_canonical += 1.0 - mis_sub_canonical.weight;
-            }
-
-            // Notes about t_neighbor_at_neighbor:
-            // 1. we assume lights aren't moving. Technically we should check if the
-            //   target light has moved, and re-evaluate the occlusion.
-            // 2. we can use the cached target score, and there is no use of the target color
-            //let t_neighbor_at_neighbor = estimate_target_pdf(neighbor_surface, neighbor_position, neighbor.selected_dir);
-            let t_neighbor_at_canonical = estimate_target_score_with_occlusion(
-                surface, position, neighbor.light_index, neighbor.light_uv, acc_struct, debug_len);
-            let mis_neighbor = balance_heuristic(
-                neighbor.target_score, t_neighbor_at_canonical.score,
-                neighbor_history * f32(accepted_count), canonical.history);
-
-            other.history = neighbor_history;
-            other.selected_light_index = neighbor.light_index;
-            other.selected_uv = neighbor.light_uv;
-            other.selected_target_score = t_neighbor_at_canonical.score;
-            other.weight_sum = t_neighbor_at_canonical.score * neighbor.contribution_weight * mis_neighbor.weight;
-            //Note: should be needed according to the paper
-            // other.history *= min(mis_neighbor.history, mis_sub_canonical.history);
-            other.radiance = t_neighbor_at_canonical.color;
-        } else {
-            other = unpack_reservoir(neighbor, max_history);
-            other.radiance = evaluate_reflected_light(surface, other.selected_light_index, other.selected_uv);
-        }
+    let base = ResampleBase(surface, canonical, position, 1.0); // last field is the accepted tap count: exactly one, the temporal tap
 
-        if (DECOUPLED_SHADING) {
-            color_and_weight += other.weight_sum * vec4<f32>(neighbor.contribution_weight * other.radiance, 1.0);
+    let prev_dir = get_ray_direction(prev_camera, tr.pixel); // view ray of the reprojected pixel in the previous camera
+    let prev_world_pos = prev_camera.position + tr.surface.depth * prev_dir; // world position of the previous-frame surface
+    let other = PixelCache(tr.surface, tr.reservoir, prev_world_pos);
+    let rr = resample(&reservoir, &color_and_weight, base, other, prev_acc_struct, parameters.temporal_tap_confidence); // the previous-frame acceleration structure is the one passed for the temporal tap
+    let mis_canonical = rr.mis_canonical;
+
+    if (WRITE_DEBUG_IMAGE && debug.view_mode == DebugMode_TemporalMatch) { // debug view: mark pixels that reached the temporal tap
+        textureStore(out_debug, cur_pixel, vec4<f32>(1.0));
+    }
+    if (WRITE_DEBUG_IMAGE && debug.view_mode == DebugMode_TemporalMisCanonical) { // debug view: canonical MIS weight of the temporal pass
+        textureStore(out_debug, cur_pixel, vec4<f32>(mis_canonical));
+    }
+
+    return finalize_resampling(&reservoir, &color_and_weight, base, mis_canonical);
+}
+
+fn resample_spatial( // spatial pass: merge reservoirs of other pixels cached by this workgroup into `canonical`
+    surface: Surface, cur_pixel: vec2<i32>, position: vec3<f32>,
+    group_id: vec3<u32>, canonical: LiveReservoir,
+) -> ResampleOutput {
+    if (surface.depth == 0.0) { // no surface: output the environment color along the view direction with a default reservoir
+        let dir = normalize(position - camera.position);
+        var ro = ResampleOutput();
+        ro.color = evaluate_environment(dir);
+        return ro;
+    }
+
+    // Gather the list of neighbors (within the workgroup) to resample.
+    var accepted_count = 0u;
+    var accepted_local_indices = array<u32, MAX_RESAMPLE>();
+    let max_accepted = min(MAX_RESAMPLE, parameters.spatial_taps); // never accept more than the local array can hold
+    let num_candidates = parameters.spatial_taps * 4u; // draw up to four random candidates per requested tap
+    for (var i = 0u; i < num_candidates && accepted_count < max_accepted; i += 1u) {
+        let other_cache_index = random_u32(&p_rng) % GROUP_SIZE_TOTAL; // random slot of the workgroup pixel cache
+        let diff = thread_index_to_coord(other_cache_index, group_id) - cur_pixel;
+        if (dot(diff, diff) < parameters.spatial_min_distance * parameters.spatial_min_distance) { // squared-distance test: skip candidates closer than spatial_min_distance
+            continue;
         }
-        if (other.weight_sum <= 0.0) {
-            bump_reservoir(&reservoir, other.history);
-        } else {
-            merge_reservoir(&reservoir, other, random_gen(rng));
+        let other = pixel_cache[other_cache_index];
+        // if the surfaces are too different, there is no trust in this sample
+        if (other.reservoir.confidence > 0.0 && compare_surfaces(surface, other.surface) > 0.1) { // also rejects cache slots whose reservoir has no confidence
+            accepted_local_indices[accepted_count] = other_cache_index;
+            accepted_count += 1u;
         }
     }
 
-    // Finally, merge in the canonical sample
-    if (PAIRWISE_MIS) {
-        canonical.weight_sum *= mis_canonical / canonical.history;
-    }
-    if (DECOUPLED_SHADING) {
-        //FIXME: issue with near zero denominator. Do we need do use BASE_CANONICAL_MIS?
-        let cw = canonical.weight_sum / max(canonical.selected_target_score * mis_canonical, 0.1);
-        color_and_weight += canonical.weight_sum * vec4<f32>(cw * canonical.radiance, 1.0);
+    var reservoir = LiveReservoir();
+    var color_and_weight = vec4<f32>(0.0);
+    let base = ResampleBase(surface, canonical, position, f32(accepted_count));
+    var mis_canonical = f32(accepted_count == 0u); // 1.0 when no neighbor was accepted, otherwise 0.0 plus the per-neighbor terms summed below
+
+    // Evaluate the MIS of each of the samples versus the canonical one.
+    for (var lid = 0u; lid < accepted_count; lid += 1u) {
+        let other = pixel_cache[accepted_local_indices[lid]];
+        let rr = resample(&reservoir, &color_and_weight, base, other, acc_struct, parameters.spatial_tap_confidence); // the current-frame acceleration structure is the one passed for spatial taps
+        mis_canonical += rr.mis_canonical;
     }
-    merge_reservoir(&reservoir, canonical, random_gen(rng));
 
-    let effective_history = select(reservoir.history, BASE_CANONICAL_MIS + f32(accepted_count), PAIRWISE_MIS);
-    let stored = pack_reservoir_detail(reservoir, effective_history);
-    reservoirs[pixel_index] = stored;
-    var ro = RestirOutput();
-    if (DECOUPLED_SHADING) {
-        ro.radiance = color_and_weight.xyz / max(color_and_weight.w, 0.001);
-    } else {
-        ro.radiance = stored.contribution_weight * reservoir.radiance;
+    if (WRITE_DEBUG_IMAGE && debug.view_mode == DebugMode_SpatialMatch) { // debug view: fraction of requested taps that were accepted
+        let value = base.accepted_count / max(1.0, f32(parameters.spatial_taps));
+        textureStore(out_debug, cur_pixel, vec4<f32>(value));
     }
-    return ro;
+    if (WRITE_DEBUG_IMAGE && debug.view_mode == DebugMode_SpatialMisCanonical) { // debug view: summed canonical MIS weight of the spatial pass
+        textureStore(out_debug, cur_pixel, vec4<f32>(mis_canonical));
+    }
+    return finalize_resampling(&reservoir, &color_and_weight, base, mis_canonical);
 }
 
-@compute @workgroup_size(8, 4)
-fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    if (any(global_id.xy >= camera.target_size)) {
+fn compute_restir( // per-pixel driver: temporal pass, workgroup exchange, spatial pass, reservoir store, temporal accumulation
+    rs: RichSurface, pixel: vec2<i32>, local_index: u32, group_id: vec3<u32>,
+) -> vec3<f32> {
+    let center_coord = vec2<f32>(pixel) + 0.5 + select(vec2<f32>(0.0), rs.motion, parameters.use_motion_vectors != 0u); // pixel center, offset by the motion vector only when motion vectors are enabled
+    let tr = find_temporal(rs.inner, pixel, center_coord);
+    let motion_sqr = dot(rs.motion, rs.motion); // squared motion length, handed to accumulate_temporal()
+
+    let temporal = resample_temporal(rs.inner, pixel, rs.position, local_index, tr);
+    pixel_cache[local_index] = PixelCache(rs.inner, temporal.reservoir, rs.position); // publish this pixel's temporal result for the other threads of the workgroup
+    var prev_pixel = select(vec2<i32>(-1), tr.pixel, tr.is_valid); // (-1, -1) when there is no valid reprojection
+
+    // sync with the workgroup to ensure all reservoirs are available.
+    workgroupBarrier(); // NOTE(review): main() returns early for out-of-range pixels before reaching this call; WGSL expects barriers in uniform control flow -- confirm this is acceptable on the targeted backends
+
+    let temporal_live = revive_canonical(temporal); // the temporal result becomes the canonical sample of the spatial pass
+    let spatial = resample_spatial(rs.inner, pixel, rs.position, group_id, temporal_live);
+
+    let pixel_index = get_reservoir_index(pixel, camera);
+    reservoirs[pixel_index] = spatial.reservoir; // store the spatial result in the reservoir buffer
+
+    accumulate_temporal(pixel, spatial.color, parameters.temporal_accumulation_weight, prev_pixel, motion_sqr);
+    return spatial.color;
+}
+
+@compute @workgroup_size(GROUP_SIZE.x, GROUP_SIZE.y)
+fn main( // compute entry point: one invocation per pixel; fetches geometry, runs ReSTIR and writes the per-pixel outputs
+    @builtin(workgroup_id) group_id: vec3<u32>,
+    @builtin(local_invocation_index) local_index: u32,
+) {
+    pixel_cache[local_index] = PixelCache(); // reset this thread's cache slot first, so threads that return early leave a default entry behind
+    let pixel_coord = thread_index_to_coord(local_index, group_id);
+    if (any(vec2<u32>(pixel_coord) >= camera.target_size)) { // pixel lies outside the render target
         return;
     }
 
-    let global_index = global_id.y * camera.target_size.x + global_id.x;
-    var rng = random_init(global_index, parameters.frame_index);
+    if (WRITE_DEBUG_IMAGE) { // write a default debug color; later debug writes may overwrite it
+        var default_color = vec3<f32>(0.0);
+        if (debug.view_mode == DebugMode_Grouping) {
+            p_rng = random_init(group_id.y * 1000u + group_id.x, 0u); // seed depends only on the workgroup id, so a whole group shares one hue
+            let h = random_gen(&p_rng) * 360.0;
+            default_color = hsv_to_rgb(h, 1.0, 1.0);
+        }
+        textureStore(out_debug, pixel_coord, vec4<f32>(default_color, 0.0));
+    }
+
+    let enable_debug = DRAW_DEBUG && all(pixel_coord == vec2<i32>(debug.mouse_pos)); // true only for the pixel under the mouse
+    let rs = fetch_geometry(pixel_coord, true, enable_debug);
+
+    let global_index = u32(pixel_coord.y) * camera.target_size.x + u32(pixel_coord.x);
+    p_rng = random_init(global_index, parameters.frame_index); // re-seed per pixel and per frame (replaces the grouping seed set above)
 
-    let surface = read_surface(vec2<i32>(global_id.xy));
-    let enable_debug = all(global_id.xy == debug.mouse_pos);
     let enable_restir_debug = (debug.draw_flags & DebugDrawFlags_RESTIR) != 0u && enable_debug;
-    let ro = compute_restir(surface, vec2<i32>(global_id.xy), &rng, enable_restir_debug);
-    let color = ro.radiance;
+    p_debug_len = select(0.0, rs.inner.depth * 0.2, enable_restir_debug); // zero unless ReSTIR debug drawing is enabled for this pixel
+    let color = compute_restir(rs, pixel_coord, local_index, group_id);
+
+    // Note: it is important to do these writes after the temporal pass specifically.
+    // TODO: option to avoid writing data for the sky
+    textureStore(inout_depth, pixel_coord, vec4<f32>(rs.inner.depth, 0.0, 0.0, 0.0));
+    textureStore(inout_basis, pixel_coord, rs.inner.basis);
+    textureStore(inout_flat_normal, pixel_coord, vec4<f32>(rs.inner.flat_normal, 0.0));
+    textureStore(out_albedo, pixel_coord, vec4<f32>(rs.albedo, 0.0));
+    if (WRITE_MOTION_VECTORS) {
+        textureStore(out_motion, pixel_coord, vec4<f32>(rs.motion * MOTION_SCALE, 0.0, 0.0));
+    }
+
     if (enable_debug) {
         debug_buf.variance.color_sum += color;
         debug_buf.variance.color2_sum += color * color;
         debug_buf.variance.count += 1u;
     }
-    textureStore(out_diffuse, global_id.xy, vec4<f32>(color, 1.0));
 }
diff --git a/blade-render/src/render/mod.rs b/blade-render/src/render/mod.rs
index c856538f..4dd56365 100644
--- a/blade-render/src/render/mod.rs
+++ b/blade-render/src/render/mod.rs
@@ -51,7 +51,13 @@ pub enum DebugMode {
     Normal = 2,
     Motion = 3,
     HitConsistency = 4,
-    Variance = 5,
+    Grouping = 5,
+    Reprojection = 6,
+    TemporalMatch = 10,
+    TemporalMisCanonical = 11,
+    SpatialMatch = 12,
+    SpatialMisCanonical = 13,
+    Variance = 100,
 }
 
 impl Default for DebugMode {
@@ -90,15 +96,24 @@ pub struct RayConfig {
     pub num_environment_samples: u32,
     pub environment_importance_sampling: bool,
     pub temporal_tap: bool,
-    pub temporal_history: u32,
+    pub temporal_confidence: f32,
     pub spatial_taps: u32,
-    pub spatial_tap_history: u32,
-    pub spatial_radius: u32,
+    pub spatial_confidence: f32,
+    /// Minimal distance to a spatially reused pixel (in the current frame).
+    pub spatial_min_distance: u32,
+    /// Scale and mix the groups into clusters, to allow spatial samples to mix
+    /// outside of the original workgroup pixel bounds.
+    pub group_mixer: u32,
     pub t_start: f32,
+    /// See "9.1 pairwise mis for robust reservoir reuse"
+    /// "Correlations and Reuse for Fast and Accurate Physically Based Light Transport"
+    pub pairwise_mis: bool,
+    pub defensive_mis: f32,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
 pub struct DenoiserConfig {
+    pub enabled: bool, // when false, the ray-trace pass is given a temporal accumulation weight of 1.0 instead of `temporal_weight`
     pub num_passes: u32,
     pub temporal_weight: f32,
 }
@@ -199,13 +214,15 @@ impl<const N: usize> RenderTarget<N> {
 }
 
 struct RestirTargets {
-    reservoir_buf: [blade_graphics::Buffer; 2],
+    reservoir_buf: blade_graphics::Buffer, // a single reservoir buffer, sized as reservoir_size * width * height
     debug: RenderTarget<1>,
-    depth: RenderTarget<2>,
-    basis: RenderTarget<2>,
-    flat_normal: RenderTarget<2>,
+    depth: RenderTarget<1>,
+    basis: RenderTarget<1>,
+    flat_normal: RenderTarget<1>,
     albedo: RenderTarget<1>,
     motion: RenderTarget<1>,
+    // One view stores the ReSTIR output color;
+    // the other two are used for the a-trous ping-pong.
     light_diffuse: RenderTarget<3>,
     camera_params: [CameraParams; 2],
 }
@@ -218,19 +235,16 @@ impl RestirTargets {
         gpu: &blade_graphics::Context,
     ) -> Self {
         let total_reservoirs = size.width as usize * size.height as usize;
-        let mut reservoir_buf = [blade_graphics::Buffer::default(); 2];
-        for (i, rb) in reservoir_buf.iter_mut().enumerate() {
-            *rb = gpu.create_buffer(blade_graphics::BufferDesc {
-                name: &format!("reservoirs{i}"),
-                size: reservoir_size as u64 * total_reservoirs as u64,
-                memory: blade_graphics::Memory::Device,
-            });
-        }
+        let reservoir_buf = gpu.create_buffer(blade_graphics::BufferDesc {
+            name: "reservoirs",
+            size: reservoir_size as u64 * total_reservoirs as u64,
+            memory: blade_graphics::Memory::Device,
+        });
 
         Self {
             reservoir_buf,
             debug: RenderTarget::new(
-                "deubg",
+                "debug",
                 blade_graphics::TextureFormat::Rgba8Unorm,
                 size,
                 encoder,
@@ -277,9 +291,7 @@ impl RestirTargets {
     }
 
     fn destroy(&self, gpu: &blade_graphics::Context) {
-        for rb in self.reservoir_buf.iter() {
-            gpu.destroy_buffer(*rb);
-        }
+        gpu.destroy_buffer(self.reservoir_buf);
         self.debug.destroy(gpu);
         self.depth.destroy(gpu);
         self.basis.destroy(gpu);
@@ -291,7 +303,6 @@ impl RestirTargets {
 }
 
 struct Blur {
-    temporal_accum_pipeline: blade_graphics::ComputePipeline,
     atrous_pipeline: blade_graphics::ComputePipeline,
 }
 
@@ -307,8 +318,6 @@ struct Blur {
 pub struct Renderer {
     shaders: Shaders,
     targets: RestirTargets,
-    post_proc_input_index: usize,
-    fill_pipeline: blade_graphics::ComputePipeline,
     main_pipeline: blade_graphics::ComputePipeline,
     post_proc_pipeline: blade_graphics::RenderPipeline,
     blur: Blur,
@@ -360,57 +369,43 @@ struct MainParams {
     num_environment_samples: u32,
     environment_importance_sampling: u32,
     temporal_tap: u32,
-    temporal_history: u32,
+    temporal_confidence: f32,
     spatial_taps: u32,
-    spatial_tap_history: u32,
-    spatial_radius: u32,
+    spatial_confidence: f32,
+    spatial_min_distance: u32,
     t_start: f32,
+    use_pairwise_mis: u32,
+    defensive_mis: f32,
     use_motion_vectors: u32,
+    temporal_accumulation_weight: f32,
+    pad: u32,
+    grid_scale: [u32; 2],
 }
 
 #[derive(blade_macros::ShaderData)]
-struct FillData<'a> {
+struct MainData<'a> {
     camera: CameraParams,
     prev_camera: CameraParams,
     debug: DebugParams,
+    parameters: MainParams,
     acc_struct: blade_graphics::AccelerationStructure,
+    prev_acc_struct: blade_graphics::AccelerationStructure,
     hit_entries: blade_graphics::BufferPiece,
     index_buffers: &'a blade_graphics::BufferArray<MAX_RESOURCES>,
     vertex_buffers: &'a blade_graphics::BufferArray<MAX_RESOURCES>,
     textures: &'a blade_graphics::TextureArray<MAX_RESOURCES>,
     sampler_linear: blade_graphics::Sampler,
-    debug_buf: blade_graphics::BufferPiece,
-    out_depth: blade_graphics::TextureView,
-    out_basis: blade_graphics::TextureView,
-    out_flat_normal: blade_graphics::TextureView,
-    out_albedo: blade_graphics::TextureView,
-    out_motion: blade_graphics::TextureView,
-    out_debug: blade_graphics::TextureView,
-}
-
-#[derive(blade_macros::ShaderData)]
-struct MainData {
-    camera: CameraParams,
-    prev_camera: CameraParams,
-    debug: DebugParams,
-    parameters: MainParams,
-    acc_struct: blade_graphics::AccelerationStructure,
-    prev_acc_struct: blade_graphics::AccelerationStructure,
-    sampler_linear: blade_graphics::Sampler,
     sampler_nearest: blade_graphics::Sampler,
     env_map: blade_graphics::TextureView,
     env_weights: blade_graphics::TextureView,
-    t_depth: blade_graphics::TextureView,
-    t_prev_depth: blade_graphics::TextureView,
-    t_basis: blade_graphics::TextureView,
-    t_prev_basis: blade_graphics::TextureView,
-    t_flat_normal: blade_graphics::TextureView,
-    t_prev_flat_normal: blade_graphics::TextureView,
-    t_motion: blade_graphics::TextureView,
     debug_buf: blade_graphics::BufferPiece,
     reservoirs: blade_graphics::BufferPiece,
-    prev_reservoirs: blade_graphics::BufferPiece,
-    out_diffuse: blade_graphics::TextureView,
+    inout_depth: blade_graphics::TextureView,
+    inout_basis: blade_graphics::TextureView,
+    inout_flat_normal: blade_graphics::TextureView,
+    out_albedo: blade_graphics::TextureView,
+    out_motion: blade_graphics::TextureView,
+    inout_diffuse: blade_graphics::TextureView,
     out_debug: blade_graphics::TextureView,
 }
 
@@ -418,27 +413,10 @@ struct MainData {
 #[derive(Clone, Copy, bytemuck::Zeroable, bytemuck::Pod)]
 struct BlurParams {
     extent: [u32; 2],
-    temporal_weight: f32,
     iteration: i32,
-    use_motion_vectors: u32,
     pad: u32,
 }
 
-#[derive(blade_macros::ShaderData)]
-struct TemporalAccumData {
-    camera: CameraParams,
-    prev_camera: CameraParams,
-    params: BlurParams,
-    input: blade_graphics::TextureView,
-    prev_input: blade_graphics::TextureView,
-    t_depth: blade_graphics::TextureView,
-    t_prev_depth: blade_graphics::TextureView,
-    t_flat_normal: blade_graphics::TextureView,
-    t_prev_flat_normal: blade_graphics::TextureView,
-    t_motion: blade_graphics::TextureView,
-    output: blade_graphics::TextureView,
-}
-
 #[derive(blade_macros::ShaderData)]
 struct AtrousData {
     params: BlurParams,
@@ -487,9 +465,8 @@ struct HitEntry {
 #[derive(Clone, PartialEq)]
 pub struct Shaders {
     env_prepare: blade_asset::Handle<crate::Shader>,
-    fill_gbuf: blade_asset::Handle<crate::Shader>,
     ray_trace: blade_asset::Handle<crate::Shader>,
-    blur: blade_asset::Handle<crate::Shader>,
+    a_trous: blade_asset::Handle<crate::Shader>,
     post_proc: blade_asset::Handle<crate::Shader>,
     debug_draw: blade_asset::Handle<crate::Shader>,
     debug_blit: blade_asset::Handle<crate::Shader>,
@@ -500,9 +477,8 @@ impl Shaders {
         let mut ctx = asset_hub.open_context(path, "shader finish");
         let shaders = Self {
             env_prepare: ctx.load_shader("env-prepare.wgsl"),
-            fill_gbuf: ctx.load_shader("fill-gbuf.wgsl"),
             ray_trace: ctx.load_shader("ray-trace.wgsl"),
-            blur: ctx.load_shader("blur.wgsl"),
+            a_trous: ctx.load_shader("a-trous.wgsl"),
             post_proc: ctx.load_shader("post-proc.wgsl"),
             debug_draw: ctx.load_shader("debug-draw.wgsl"),
             debug_blit: ctx.load_shader("debug-blit.wgsl"),
@@ -512,29 +488,14 @@ impl Shaders {
 }
 
 struct ShaderPipelines {
-    fill: blade_graphics::ComputePipeline,
     main: blade_graphics::ComputePipeline,
-    temporal_accum: blade_graphics::ComputePipeline,
-    atrous: blade_graphics::ComputePipeline,
+    a_trous: blade_graphics::ComputePipeline,
     post_proc: blade_graphics::RenderPipeline,
     env_prepare: blade_graphics::ComputePipeline,
     reservoir_size: u32,
 }
 
 impl ShaderPipelines {
-    fn create_gbuf_fill(
-        shader: &blade_graphics::Shader,
-        gpu: &blade_graphics::Context,
-    ) -> blade_graphics::ComputePipeline {
-        shader.check_struct_size::<crate::Vertex>();
-        shader.check_struct_size::<HitEntry>();
-        let layout = <FillData as blade_graphics::ShaderData>::layout();
-        gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
-            name: "fill-gbuf",
-            data_layouts: &[&layout],
-            compute: shader.at("main"),
-        })
-    }
     fn create_ray_trace(
         shader: &blade_graphics::Shader,
         gpu: &blade_graphics::Context,
@@ -545,32 +506,26 @@ impl ShaderPipelines {
         shader.check_struct_size::<DebugVariance>();
         shader.check_struct_size::<DebugEntry>();
         let layout = <MainData as blade_graphics::ShaderData>::layout();
-        gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
+        let pipeline = gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
             name: "ray-trace",
             data_layouts: &[&layout],
             compute: shader.at("main"),
-        })
-    }
+        });
 
-    fn create_temporal_accum(
-        shader: &blade_graphics::Shader,
-        gpu: &blade_graphics::Context,
-    ) -> blade_graphics::ComputePipeline {
-        let layout = <TemporalAccumData as blade_graphics::ShaderData>::layout();
-        gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
-            name: "temporal-accum",
-            data_layouts: &[&layout],
-            compute: shader.at("temporal_accum"),
-        })
+        let pl_struct_size = shader.get_struct_size("PixelCache");
+        let group_size = pipeline.get_workgroup_size();
+        let wg_required = pl_struct_size * group_size[0] * group_size[1];
+        log::info!("Using {} workgroup memory for RT", wg_required);
+        pipeline
     }
 
-    fn create_atrous(
+    fn create_a_trous(
         shader: &blade_graphics::Shader,
         gpu: &blade_graphics::Context,
     ) -> blade_graphics::ComputePipeline {
         let layout = <AtrousData as blade_graphics::ShaderData>::layout();
         gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
-            name: "atrous",
+            name: "a-trous",
             data_layouts: &[&layout],
             compute: shader.at("atrous3x3"),
         })
@@ -604,12 +559,10 @@ impl ShaderPipelines {
         shader_man: &blade_asset::AssetManager<crate::shader::Baker>,
     ) -> Result<Self, &'static str> {
         let sh_main = shader_man[shaders.ray_trace].raw.as_ref().unwrap();
-        let sh_blur = shader_man[shaders.blur].raw.as_ref().unwrap();
+        let sh_atrous = shader_man[shaders.a_trous].raw.as_ref().unwrap();
         Ok(Self {
-            fill: Self::create_gbuf_fill(shader_man[shaders.fill_gbuf].raw.as_ref().unwrap(), gpu),
             main: Self::create_ray_trace(sh_main, gpu),
-            temporal_accum: Self::create_temporal_accum(sh_blur, gpu),
-            atrous: Self::create_atrous(sh_blur, gpu),
+            a_trous: Self::create_a_trous(sh_atrous, gpu),
             post_proc: Self::create_post_proc(
                 shader_man[shaders.post_proc].raw.as_ref().unwrap(),
                 config.surface_info,
@@ -639,6 +592,11 @@ pub struct FrameResources {
     pub acceleration_structures: Vec<blade_graphics::AccelerationStructure>,
 }
 
+#[derive(Debug, Default)]
+pub struct FrameKey { // value returned by the renderer's ray-tracing entry point
+    post_proc_input_index: usize, // private; NOTE(review): presumably selects which target view post-processing reads -- confirm against the consumer
+}
+
 impl Renderer {
     /// Create a new renderer with a given configuration.
     ///
@@ -696,13 +654,10 @@ impl Renderer {
         Self {
             shaders,
             targets,
-            post_proc_input_index: 0,
-            fill_pipeline: sp.fill,
             main_pipeline: sp.main,
             post_proc_pipeline: sp.post_proc,
             blur: Blur {
-                temporal_accum_pipeline: sp.temporal_accum,
-                atrous_pipeline: sp.atrous,
+                atrous_pipeline: sp.a_trous,
             },
             acceleration_structure: blade_graphics::AccelerationStructure::default(),
             prev_acceleration_structure: blade_graphics::AccelerationStructure::default(),
@@ -742,9 +697,7 @@ impl Renderer {
         gpu.destroy_sampler(self.samplers.nearest);
         gpu.destroy_sampler(self.samplers.linear);
         // pipelines
-        gpu.destroy_compute_pipeline(&mut self.blur.temporal_accum_pipeline);
         gpu.destroy_compute_pipeline(&mut self.blur.atrous_pipeline);
-        gpu.destroy_compute_pipeline(&mut self.fill_pipeline);
         gpu.destroy_compute_pipeline(&mut self.main_pipeline);
         gpu.destroy_render_pipeline(&mut self.post_proc_pipeline);
     }
@@ -759,9 +712,8 @@ impl Renderer {
         let mut tasks = Vec::new();
         let old = self.shaders.clone();
 
-        tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.fill_gbuf));
         tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.ray_trace));
-        tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.blur));
+        tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.a_trous));
         tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.post_proc));
         tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.debug_draw));
         tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.debug_blit));
@@ -776,11 +728,6 @@ impl Renderer {
             let _ = task.join();
         }
 
-        if self.shaders.fill_gbuf != old.fill_gbuf {
-            if let Ok(ref shader) = asset_hub.shaders[self.shaders.fill_gbuf].raw {
-                self.fill_pipeline = ShaderPipelines::create_gbuf_fill(shader, gpu);
-            }
-        }
         if self.shaders.ray_trace != old.ray_trace {
             if let Ok(ref shader) = asset_hub.shaders[self.shaders.ray_trace].raw {
                 assert_eq!(
@@ -790,11 +737,9 @@ impl Renderer {
                 self.main_pipeline = ShaderPipelines::create_ray_trace(shader, gpu);
             }
         }
-        if self.shaders.blur != old.blur {
-            if let Ok(ref shader) = asset_hub.shaders[self.shaders.blur].raw {
-                self.blur.temporal_accum_pipeline =
-                    ShaderPipelines::create_temporal_accum(shader, gpu);
-                self.blur.atrous_pipeline = ShaderPipelines::create_atrous(shader, gpu);
+        if self.shaders.a_trous != old.a_trous {
+            if let Ok(ref shader) = asset_hub.shaders[self.shaders.a_trous].raw {
+                self.blur.atrous_pipeline = ShaderPipelines::create_a_trous(shader, gpu);
             }
         }
         if self.shaders.post_proc != old.post_proc {
@@ -1089,20 +1034,17 @@ impl Renderer {
                 self.debug.reset_lines(&mut transfer);
             }
             let total_reservoirs = self.surface_size.width as u64 * self.surface_size.height as u64;
-            for reservoir_buf in self.targets.reservoir_buf.iter() {
-                transfer.fill_buffer(
-                    reservoir_buf.at(0),
-                    total_reservoirs * self.reservoir_size as u64,
-                    0,
-                );
-            }
+            transfer.fill_buffer(
+                self.targets.reservoir_buf.at(0),
+                total_reservoirs * self.reservoir_size as u64,
+                0,
+            );
         }
 
         if !config.frozen {
             self.frame_index += 1;
         }
         self.targets.camera_params[self.frame_index % 2] = self.make_camera_params(camera);
-        self.post_proc_input_index = self.frame_index % 2;
     }
 
     /// Ray trace the scene.
@@ -1114,40 +1056,33 @@ impl Renderer {
         command_encoder: &mut blade_graphics::CommandEncoder,
         debug_config: DebugConfig,
         ray_config: RayConfig,
-    ) {
+        denoiser_config: DenoiserConfig,
+    ) -> FrameKey {
         let debug = self.make_debug_params(&debug_config);
         let (cur, prev) = self.work_indices();
+        let mut post_proc_input_index = 0;
 
         if let mut pass = command_encoder.compute() {
-            let mut pc = pass.with(&self.fill_pipeline);
-            let groups = self.fill_pipeline.get_dispatch_for(self.surface_size);
-            pc.bind(
-                0,
-                &FillData {
-                    camera: self.targets.camera_params[cur],
-                    prev_camera: self.targets.camera_params[prev],
-                    debug,
-                    acc_struct: self.acceleration_structure,
-                    hit_entries: self.hit_buffer.into(),
-                    index_buffers: &self.index_buffers,
-                    vertex_buffers: &self.vertex_buffers,
-                    textures: &self.textures,
-                    sampler_linear: self.samplers.linear,
-                    debug_buf: self.debug.buffer_resource(),
-                    out_depth: self.targets.depth.views[cur],
-                    out_basis: self.targets.basis.views[cur],
-                    out_flat_normal: self.targets.flat_normal.views[cur],
-                    out_albedo: self.targets.albedo.views[0],
-                    out_motion: self.targets.motion.views[0],
-                    out_debug: self.targets.debug.views[0],
-                },
-            );
-            pc.dispatch(groups);
-        }
-
-        if let mut pass = command_encoder.compute() {
+            let grid_scale = { // per-frame cluster scale; each axis ends up in 1..=limit
+                let limit = ray_config.group_mixer.max(1); // clamp: a zero `group_mixer` would otherwise panic in the `%` and `/` below
+                let r = self.frame_index as u32 ^ 0x5A; // frame counter XOR a constant, so the scale changes from frame to frame
+                [r % limit + 1, (r / limit) % limit + 1]
+            };
+            let groups = {
+                let wg_size = self.main_pipeline.get_workgroup_size();
+                let cluster_size = [
+                    wg_size[0] * grid_scale[0],
+                    wg_size[1] * grid_scale[1],
+                    wg_size[2],
+                ];
+                let clusters = self.surface_size.group_by(cluster_size);
+                [
+                    clusters[0] * grid_scale[0],
+                    clusters[1] * grid_scale[1],
+                    clusters[2],
+                ]
+            };
             let mut pc = pass.with(&self.main_pipeline);
-            let groups = self.main_pipeline.get_dispatch_for(self.surface_size);
             pc.bind(
                 0,
                 &MainData {
@@ -1160,12 +1095,21 @@ impl Renderer {
                         environment_importance_sampling: ray_config.environment_importance_sampling
                             as u32,
                         temporal_tap: ray_config.temporal_tap as u32,
-                        temporal_history: ray_config.temporal_history,
+                        temporal_confidence: ray_config.temporal_confidence,
                         spatial_taps: ray_config.spatial_taps,
-                        spatial_tap_history: ray_config.spatial_tap_history,
-                        spatial_radius: ray_config.spatial_radius,
+                        spatial_confidence: ray_config.spatial_confidence,
+                        spatial_min_distance: ray_config.spatial_min_distance,
                         t_start: ray_config.t_start,
+                        use_pairwise_mis: ray_config.pairwise_mis as u32,
+                        defensive_mis: ray_config.defensive_mis,
                         use_motion_vectors: (self.frame_scene_built == self.frame_index) as u32,
+                        temporal_accumulation_weight: if denoiser_config.enabled {
+                            denoiser_config.temporal_weight
+                        } else {
+                            1.0
+                        },
+                        pad: 0,
+                        grid_scale,
                     },
                     acc_struct: self.acceleration_structure,
                     prev_acc_struct: if self.frame_scene_built < self.frame_index
@@ -1176,95 +1120,61 @@ impl Renderer {
                     } else {
                         self.prev_acceleration_structure
                     },
+                    hit_entries: self.hit_buffer.into(),
+                    index_buffers: &self.index_buffers,
+                    vertex_buffers: &self.vertex_buffers,
+                    textures: &self.textures,
                     sampler_linear: self.samplers.linear,
                     sampler_nearest: self.samplers.nearest,
                     env_map: self.env_map.main_view,
                     env_weights: self.env_map.weight_view,
-                    t_depth: self.targets.depth.views[cur],
-                    t_prev_depth: self.targets.depth.views[prev],
-                    t_basis: self.targets.basis.views[cur],
-                    t_prev_basis: self.targets.basis.views[prev],
-                    t_flat_normal: self.targets.flat_normal.views[cur],
-                    t_prev_flat_normal: self.targets.flat_normal.views[prev],
-                    t_motion: self.targets.motion.views[0],
                     debug_buf: self.debug.buffer_resource(),
-                    reservoirs: self.targets.reservoir_buf[cur].into(),
-                    prev_reservoirs: self.targets.reservoir_buf[prev].into(),
-                    out_diffuse: self.targets.light_diffuse.views[cur],
+                    reservoirs: self.targets.reservoir_buf.into(),
+                    inout_depth: self.targets.depth.views[0],
+                    inout_basis: self.targets.basis.views[0],
+                    inout_flat_normal: self.targets.flat_normal.views[0],
+                    out_albedo: self.targets.albedo.views[0],
+                    out_motion: self.targets.motion.views[0],
+                    inout_diffuse: self.targets.light_diffuse.views[post_proc_input_index],
                     out_debug: self.targets.debug.views[0],
                 },
             );
             pc.dispatch(groups);
         }
-    }
 
-    /// Perform noise reduction using SVGF.
-    #[profiling::function]
-    pub fn denoise(
-        &mut self, //TODO: borrow immutably
-        command_encoder: &mut blade_graphics::CommandEncoder,
-        denoiser_config: DenoiserConfig,
-    ) {
-        let mut params = BlurParams {
-            extent: [self.surface_size.width, self.surface_size.height],
-            temporal_weight: denoiser_config.temporal_weight,
-            iteration: 0,
-            use_motion_vectors: (self.frame_scene_built == self.frame_index) as u32,
-            pad: 0,
-        };
-        let (cur, prev) = self.work_indices();
-        let temp = 2;
-
-        if denoiser_config.temporal_weight < 1.0 {
-            let mut pass = command_encoder.compute();
-            let mut pc = pass.with(&self.blur.temporal_accum_pipeline);
-            let groups = self
-                .blur
-                .atrous_pipeline
-                .get_dispatch_for(self.surface_size);
-            pc.bind(
-                0,
-                &TemporalAccumData {
-                    camera: self.targets.camera_params[cur],
-                    prev_camera: self.targets.camera_params[prev],
-                    params,
-                    input: self.targets.light_diffuse.views[cur],
-                    prev_input: self.targets.light_diffuse.views[prev],
-                    t_depth: self.targets.depth.views[cur],
-                    t_prev_depth: self.targets.depth.views[prev],
-                    t_flat_normal: self.targets.flat_normal.views[cur],
-                    t_prev_flat_normal: self.targets.flat_normal.views[prev],
-                    t_motion: self.targets.motion.views[0],
-                    output: self.targets.light_diffuse.views[temp],
-                },
-            );
-            pc.dispatch(groups);
-            //Note: making `cur` contain the latest reprojection output
-            self.targets.light_diffuse.views.swap(cur, temp);
+        if denoiser_config.enabled {
+            let mut params = BlurParams {
+                extent: [self.surface_size.width, self.surface_size.height],
+                iteration: 0,
+                pad: 0,
+            };
+            let mut ping_pong = [1, 2];
+            for _ in 0..denoiser_config.num_passes {
+                let mut pass = command_encoder.compute();
+                let mut pc = pass.with(&self.blur.atrous_pipeline);
+                let groups = self
+                    .blur
+                    .atrous_pipeline
+                    .get_dispatch_for(self.surface_size);
+                pc.bind(
+                    0,
+                    &AtrousData {
+                        params,
+                        input: self.targets.light_diffuse.views[post_proc_input_index],
+                        t_depth: self.targets.depth.views[0],
+                        t_flat_normal: self.targets.flat_normal.views[0],
+                        output: self.targets.light_diffuse.views[ping_pong[0]],
+                    },
+                );
+                pc.dispatch(groups);
+                post_proc_input_index = ping_pong[0];
+                ping_pong.swap(0, 1);
+                params.iteration += 1;
+            }
         }
 
-        let mut ping_pong = [temp, prev];
-        for _ in 0..denoiser_config.num_passes {
-            let mut pass = command_encoder.compute();
-            let mut pc = pass.with(&self.blur.atrous_pipeline);
-            let groups = self
-                .blur
-                .atrous_pipeline
-                .get_dispatch_for(self.surface_size);
-            pc.bind(
-                0,
-                &AtrousData {
-                    params,
-                    input: self.targets.light_diffuse.views[self.post_proc_input_index],
-                    t_depth: self.targets.depth.views[cur],
-                    t_flat_normal: self.targets.flat_normal.views[cur],
-                    output: self.targets.light_diffuse.views[ping_pong[0]],
-                },
-            );
-            pc.dispatch(groups);
-            self.post_proc_input_index = ping_pong[0];
-            ping_pong.swap(0, 1);
-            params.iteration += 1;
+        FrameKey {
+            post_proc_input_index,
         }
     }
 
@@ -1273,6 +1183,7 @@ impl Renderer {
     pub fn post_proc(
         &self,
         pass: &mut blade_graphics::RenderCommandEncoder,
+        key: FrameKey,
         debug_config: DebugConfig,
         pp_config: PostProcConfig,
         debug_lines: &[DebugLine],
@@ -1285,7 +1196,7 @@ impl Renderer {
                 0,
                 &PostProcData {
                     t_albedo: self.targets.albedo.views[0],
-                    light_diffuse: self.targets.light_diffuse.views[self.post_proc_input_index],
+                    light_diffuse: self.targets.light_diffuse.views[key.post_proc_input_index],
                     t_debug: self.targets.debug.views[0],
                     tone_map_params: ToneMapParams {
                         enabled: 1,
@@ -1302,7 +1213,7 @@ impl Renderer {
         self.debug.render_lines(
             debug_lines,
             self.targets.camera_params[cur],
-            self.targets.depth.views[cur],
+            self.targets.depth.views[0],
             pass,
         );
         self.debug
diff --git a/examples/init/env-sample.wgsl b/examples/init/env-sample.wgsl
index 2c4d379d..7891133e 100644
--- a/examples/init/env-sample.wgsl
+++ b/examples/init/env-sample.wgsl
@@ -5,12 +5,13 @@ const PI: f32 = 3.1415926;
 const BUMP: f32 = 0.025;
 
 var env_main: texture_2d<f32>;
+var<private> p_rng: RandomState;
 
 @vertex
 fn vs_accum(@builtin(vertex_index) vi: u32) -> @builtin(position) vec4<f32> {
-    var rng = random_init(vi, 0u);
     let dim = textureDimensions(env_main);
-    let es = generate_environment_sample(&rng, dim);
+    p_rng = random_init(vi, 0u);
+    let es = generate_environment_sample(&p_rng, dim);
     let extent = textureDimensions(env_weights, 0);
     let relative = (vec2<f32>(es.pixel) + vec2<f32>(0.5)) / vec2<f32>(extent);
     return vec4<f32>(relative.x - 1.0, 1.0 - relative.y, 0.0, 1.0);
diff --git a/examples/scene/main.rs b/examples/scene/main.rs
index f09de24a..c58c6a48 100644
--- a/examples/scene/main.rs
+++ b/examples/scene/main.rs
@@ -159,7 +159,6 @@ struct Example {
     last_render_time: time::Instant,
     render_times: VecDeque<u32>,
     ray_config: blade_render::RayConfig,
-    denoiser_enabled: bool,
     denoiser_config: blade_render::DenoiserConfig,
     post_proc_config: blade_render::PostProcConfig,
     debug_blit: Option<blade_render::DebugBlit>,
@@ -259,16 +258,19 @@ impl Example {
             render_times: VecDeque::with_capacity(FRAME_TIME_HISTORY),
             ray_config: blade_render::RayConfig {
                 num_environment_samples: 1,
-                environment_importance_sampling: false,
+                environment_importance_sampling: true,
                 temporal_tap: true,
-                temporal_history: 10,
+                temporal_confidence: 10.0,
                 spatial_taps: 1,
-                spatial_tap_history: 5,
-                spatial_radius: 10,
+                spatial_confidence: 5.0,
+                spatial_min_distance: 2,
+                group_mixer: 10,
                 t_start: 0.1,
+                pairwise_mis: true,
+                defensive_mis: 0.0,
             },
-            denoiser_enabled: true,
             denoiser_config: blade_render::DenoiserConfig {
+                enabled: true,
                 num_passes: 3,
                 temporal_weight: 0.1,
             },
@@ -458,6 +460,7 @@ impl Example {
         // even while it's still being loaded.
         let do_render =
             self.scene_load_task.is_none() || (RENDER_WHILE_LOADING && self.scene_revision != 0);
+        let mut frame_key = blade_render::FrameKey::default();
         if do_render {
             self.renderer.prepare(
                 command_encoder,
@@ -474,11 +477,12 @@ impl Example {
             //TODO: figure out why the main RT pipeline
             // causes a GPU crash when there are no objects
             if !self.objects.is_empty() {
-                self.renderer
-                    .ray_trace(command_encoder, self.debug, self.ray_config);
-                if self.denoiser_enabled {
-                    self.renderer.denoise(command_encoder, self.denoiser_config);
-                }
+                frame_key = self.renderer.ray_trace(
+                    command_encoder,
+                    self.debug,
+                    self.ray_config,
+                    self.denoiser_config,
+                );
             }
         }
 
@@ -508,6 +512,7 @@ impl Example {
                 };
                 self.renderer.post_proc(
                     &mut pass,
+                    frame_key,
                     self.debug,
                     self.post_proc_config,
                     &[],
@@ -582,6 +587,8 @@ impl Example {
             return;
         }
 
+        ui.checkbox(&mut self.track_hot_reloads, "Hot reloading");
+
         let mut selection = blade_render::SelectionInfo::default();
         if self.debug.mouse_pos.is_some() {
             selection = self.renderer.read_debug_selection_info();
@@ -669,7 +676,6 @@ impl Example {
         egui::CollapsingHeader::new("Denoise")
             .default_open(false)
             .show(ui, |ui| {
-                ui.checkbox(&mut self.denoiser_enabled, "Enable");
                 self.denoiser_config.populate_hud(ui);
             });
 
@@ -945,6 +951,9 @@ fn main() {
                             }
                             last_mouse_pos = [position.x as i32, position.y as i32];
                         }
+                        winit::event::WindowEvent::MouseWheel { delta, .. } => {
+                            example.camera.on_wheel(delta);
+                        }
                         winit::event::WindowEvent::HoveredFile(_) => {
                             example.is_file_hovered = true;
                             example
diff --git a/src/lib.rs b/src/lib.rs
index 5c8320b6..72e0a2c4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -378,7 +378,6 @@ pub struct Engine {
     debug: blade_render::DebugConfig,
     pub frame_config: blade_render::FrameConfig,
     pub ray_config: blade_render::RayConfig,
-    pub denoiser_enabled: bool,
     pub denoiser_config: blade_render::DenoiserConfig,
     pub post_proc_config: blade_render::PostProcConfig,
     track_hot_reloads: bool,
@@ -484,16 +483,19 @@ impl Engine {
             },
             ray_config: blade_render::RayConfig {
                 num_environment_samples: 1,
-                environment_importance_sampling: false,
+                environment_importance_sampling: true,
                 temporal_tap: true,
-                temporal_history: 10,
+                temporal_confidence: 10.0,
                 spatial_taps: 1,
-                spatial_tap_history: 5,
-                spatial_radius: 10,
+                spatial_confidence: 10.0,
+                spatial_min_distance: 2,
+                group_mixer: 10,
                 t_start: 0.01,
+                pairwise_mis: true,
+                defensive_mis: 0.1,
             },
-            denoiser_enabled: true,
             denoiser_config: blade_render::DenoiserConfig {
+                enabled: true,
                 num_passes: 4,
                 temporal_weight: 0.1,
             },
@@ -572,6 +574,7 @@ impl Engine {
 
         // We should be able to update TLAS and render content
         // even while it's still being loaded.
+        let mut frame_key = blade_render::FrameKey::default();
         if self.load_tasks.is_empty() {
             self.render_objects.clear();
             for (_, object) in self.objects.iter_mut() {
@@ -627,11 +630,12 @@ impl Engine {
             self.frame_config.reset_reservoirs = false;
 
             if !self.render_objects.is_empty() {
-                self.renderer
-                    .ray_trace(command_encoder, self.debug, self.ray_config);
-                if self.denoiser_enabled {
-                    self.renderer.denoise(command_encoder, self.denoiser_config);
-                }
+                frame_key = self.renderer.ray_trace(
+                    command_encoder,
+                    self.debug,
+                    self.ray_config,
+                    self.denoiser_config,
+                );
             }
         }
 
@@ -701,6 +705,7 @@ impl Engine {
             if self.load_tasks.is_empty() {
                 self.renderer.post_proc(
                     &mut pass,
+                    frame_key,
                     self.debug,
                     self.post_proc_config,
                     &debug_lines,
@@ -734,8 +739,7 @@ impl Engine {
             .default_open(false)
             .show(ui, |ui| {
                 self.ray_config.populate_hud(ui);
-                self.frame_config.reset_reservoirs |= ui.button("Reset Accumulation").clicked();
-                ui.checkbox(&mut self.denoiser_enabled, "Enable Denoiser");
+                self.frame_config.populate_hud(ui);
                 self.denoiser_config.populate_hud(ui);
                 self.post_proc_config.populate_hud(ui);
             });