From 0517ca9ba83056e2cf24c11bdf73341b3f0eeb1e Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Sun, 11 Jan 2026 20:09:29 -0800 Subject: [PATCH 1/2] make unrolled angle calculation for line of sight straightlined After some thought about how the L1 cache behaves in our line-of-sight algorithm, this commit first calculates all the angles and stores them in a buffer, and an unrolled prefix max is then calculated over that buffer. It ends up being much quicker on my i9-9900K, offering about a 20% speedup, and machines with larger L1 caches are expected to do even better. --- .../total-viewsheds/src/cpu/unrolled_los.rs | 67 +++++++++---------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/crates/total-viewsheds/src/cpu/unrolled_los.rs b/crates/total-viewsheds/src/cpu/unrolled_los.rs index 9ea9e4c..49812ae 100644 --- a/crates/total-viewsheds/src/cpu/unrolled_los.rs +++ b/crates/total-viewsheds/src/cpu/unrolled_los.rs @@ -44,6 +44,9 @@ where /// `UnrolledLOS` implements an Unrolled `LineOfSight` calculation pub struct UnrolledVectorLos { + /// `angles` holds a buffer for line of sight angles to be put into + /// which is exactly `max_los+1` long + angles: Vec, /// `distances` holds `max_los` distances distances: Vec, /// `adjustments` holds `max_los` earth curvature adjustments @@ -93,6 +96,7 @@ impl UnrolledVectorLos (f32, f32, Vec) { - let mut angles = [0.0f32; UNROLL * VECTOR_WIDTH + 1]; let mut prefix_max = [0.0f32; UNROLL * VECTOR_WIDTH]; prefix_max[UNROLL * VECTOR_WIDTH - 1] = -2000.0; - angles[0] = -2000.0; + + VectorLos::::calculate_angles( + pov_height, + line, + &self.distances, + &self.adjustments, + &mut self.angles[1..], + ); let mut output: Vec = vec![]; - let (chunked_line, rest_line) = line.as_chunks::<{ UNROLL * VECTOR_WIDTH }>(); + let (chunked_prefix_angles, rest_prefix_angles) = + self.angles[..self.angles.len() - 1].as_chunks::<{ UNROLL * VECTOR_WIDTH }>(); + let (chunked_angles, rest_angles) = + self.angles[1..].as_chunks::<{ UNROLL
* VECTOR_WIDTH }>(); let (chunked_distances, rest_distances) = self.distances.as_chunks::<{ UNROLL * VECTOR_WIDTH }>(); - let (chunked_adjustments, rest_adjustments) = - self.adjustments.as_chunks::<{ UNROLL * VECTOR_WIDTH }>(); - - let los = izip!(chunked_line, chunked_distances, chunked_adjustments).fold( + let los = izip!(chunked_prefix_angles, chunked_angles, chunked_distances).fold( UnrollVector:: { longest: [0.0; UNROLL * VECTOR_WIDTH], heatmap: [0.0; UNROLL * VECTOR_WIDTH], }, - |acc, (unroll_line, distances, adjusts)| { - VectorLos::::calculate_angles( - pov_height, - unroll_line, - distances, - adjusts, - &mut angles[1..], - ); - + |acc, (prefix_angles, angles, distances)| { VectorLos::::prefix_max( prefix_max[UNROLL * VECTOR_WIDTH - 1], - &angles[..UNROLL * VECTOR_WIDTH], + prefix_angles, &mut prefix_max, ); - let new_acc = VectorLos::::accumulate( + VectorLos::::accumulate( acc, - &angles[1..], + angles, &prefix_max, distances, &mut output, - ); - - angles[0] = angles[UNROLL]; - new_acc + ) }, ); - VectorLos::::calculate_angles( - pov_height, - rest_line, - rest_distances, - rest_adjustments, - &mut angles[1..=rest_line.len()], - ); - VectorLos::::prefix_max( prefix_max[UNROLL * VECTOR_WIDTH - 1], - &angles[..rest_line.len()], - &mut prefix_max[..rest_line.len()], + rest_prefix_angles, + &mut prefix_max[..rest_angles.len()], ); let new_acc = VectorLos::::accumulate( los, - &angles[1..=rest_line.len()], - &prefix_max[..rest_line.len()], + rest_angles, + &prefix_max[..rest_angles.len()], rest_distances, &mut output, ); @@ -213,11 +204,13 @@ where ) -> UnrollVector { debug_assert!( angles.len().is_multiple_of(VECTOR_WIDTH), - "distance unroll should be multiple of width" + "angles with len {} should be multiple of {}", + angles.len(), + VECTOR_WIDTH, ); debug_assert!( prefix.len().is_multiple_of(VECTOR_WIDTH), - "distance unroll should be multiple of width" + "prefix unroll should be multiple of width" ); debug_assert!( 
distances.len().is_multiple_of(VECTOR_WIDTH), From 257ae9fd9f728002fa98b12abb685f4058c476c0 Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Sun, 11 Jan 2026 21:31:11 -0800 Subject: [PATCH 2/2] update unroll factor based on new algorithm --- crates/total-viewsheds/src/cpu/kernel.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/total-viewsheds/src/cpu/kernel.rs b/crates/total-viewsheds/src/cpu/kernel.rs index 9d25f66..3e2a432 100644 --- a/crates/total-viewsheds/src/cpu/kernel.rs +++ b/crates/total-viewsheds/src/cpu/kernel.rs @@ -46,8 +46,7 @@ fn dem_to_pov(dem_id: i32, width: usize, max_los: usize) -> i32 { /// vectors, and 10-way unrolling for the 16-wide vector as it is optimal for Turins const DEFAULT_UNROLL: usize = const { match DEFAULT_VECTOR_LENGTH { - 4 | 8 => 8, - 16 => 10, + 4 | 8 | 16 => 10, #[expect( clippy::unreachable, reason = "no one should be setting any other constants"