From 48b18733c0cc41284923b340ad55fc72cad7a5fd Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sat, 8 Mar 2025 22:21:49 -0600 Subject: [PATCH 01/10] Changing array_utils --- Cargo.toml | 2 +- src/algorithm/butterflies.rs | 4 +- src/array_utils.rs | 61 ++++++++++++++++++++++++++ src/avx/avx32_butterflies.rs | 8 ++-- src/avx/avx64_butterflies.rs | 8 ++-- src/avx/mod.rs | 8 ++-- src/common.rs | 8 ++-- src/neon/neon_butterflies.rs | 8 ++-- src/neon/neon_common.rs | 8 ++-- src/sse/sse_butterflies.rs | 12 ++--- src/sse/sse_common.rs | 8 ++-- src/wasm_simd/wasm_simd_butterflies.rs | 8 ++-- src/wasm_simd/wasm_simd_common.rs | 4 +- 13 files changed, 104 insertions(+), 43 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 270c094e..74b1a875 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ categories = ["algorithms", "compression", "multimedia::encoding", "science"] license = "MIT OR Apache-2.0" [features] -default = ["avx", "sse", "neon"] +default = ["avx"] # On x86_64, the "avx" feature enables compilation of AVX-acclerated code. # Similarly, the "sse" feature enables compilation of SSE-accelerated code. diff --git a/src/algorithm/butterflies.rs b/src/algorithm/butterflies.rs index 12926189..17db5170 100644 --- a/src/algorithm/butterflies.rs +++ b/src/algorithm/butterflies.rs @@ -29,7 +29,7 @@ macro_rules! boilerplate_fft_butterfly { return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here } - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -56,7 +56,7 @@ macro_rules! boilerplate_fft_butterfly { return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here } - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| unsafe { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| unsafe { self.perform_fft_butterfly(chunk) }); diff --git a/src/array_utils.rs b/src/array_utils.rs index 8dd0aba6..e9ac585d 100644 --- a/src/array_utils.rs +++ b/src/array_utils.rs @@ -146,6 +146,29 @@ mod unit_tests { // Loop over exact chunks of the provided buffer. Very similar in semantics to ChunksExactMut, but generates smaller code and requires no modulo operations // Returns Ok() if every element ended up in a chunk, Err() if there was a remainder pub fn iter_chunks( + mut buffer: &[T], + chunk_size: usize, + mut chunk_fn: impl FnMut(&[T]), +) -> Result<(), ()> { + // Loop over the buffer, splicing off chunk_size at a time, and calling chunk_fn on each + while buffer.len() >= chunk_size { + let (head, tail) = buffer.split_at(chunk_size); + buffer = tail; + + chunk_fn(head); + } + + // We have a remainder if there's data still in the buffer -- in which case we want to indicate to the caller that there was an unwanted remainder + if buffer.len() == 0 { + Ok(()) + } else { + Err(()) + } +} + +// Loop over exact chunks of the provided buffer. Very similar in semantics to ChunksExactMut, but generates smaller code and requires no modulo operations +// Returns Ok() if every element ended up in a chunk, Err() if there was a remainder +pub fn iter_chunks_mut( mut buffer: &mut [T], chunk_size: usize, mut chunk_fn: impl FnMut(&mut [T]), @@ -169,6 +192,44 @@ pub fn iter_chunks( // Loop over exact zipped chunks of the 2 provided buffers. Very similar in semantics to ChunksExactMut.zip(ChunksExactMut), but generates smaller code and requires no modulo operations // Returns Ok() if every element of both buffers ended up in a chunk, Err() if there was a remainder pub fn iter_chunks_zipped( + mut buffer1: &[T], + mut buffer2: &mut [T], + chunk_size: usize, + mut chunk_fn: impl FnMut(&[T], &mut [T]), +) -> Result<(), ()> { + // If the two buffers aren't the same size, record the fact that they're different, then snip them to be the same size + let uneven = if buffer1.len() > buffer2.len() { + buffer1 = &buffer1[..buffer2.len()]; + true + } else if buffer2.len() < buffer1.len() { + buffer2 = &mut buffer2[..buffer1.len()]; + true + } else { + false + }; + + // Now that we know the two slices are the same length, loop over each one, splicing off chunk_size at a time, and calling chunk_fn on each + while buffer1.len() >= chunk_size && buffer2.len() >= chunk_size { + let (head1, tail1) = buffer1.split_at(chunk_size); + buffer1 = tail1; + + let (head2, tail2) = buffer2.split_at_mut(chunk_size); + buffer2 = tail2; + + chunk_fn(head1, head2); + } + + // We have a remainder if the 2 chunks were uneven to start with, or if there's still data in the buffers -- in which case we want to indicate to the caller that there was an unwanted remainder + if !uneven && buffer1.len() == 0 { + Ok(()) + } else { + Err(()) + } +} + +// Loop over exact zipped chunks of the 2 provided buffers. Very similar in semantics to ChunksExactMut.zip(ChunksExactMut), but generates smaller code and requires no modulo operations +// Returns Ok() if every element of both buffers ended up in a chunk, Err() if there was a remainder +pub fn iter_chunks_zipped_mut( mut buffer1: &mut [T], mut buffer2: &mut [T], chunk_size: usize, diff --git a/src/avx/avx32_butterflies.rs b/src/avx/avx32_butterflies.rs index 87f9c10a..91f9def5 100644 --- a/src/avx/avx32_butterflies.rs +++ b/src/avx/avx32_butterflies.rs @@ -49,7 +49,7 @@ macro_rules! boilerplate_fft_simd_butterfly { return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here } - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -79,7 +79,7 @@ macro_rules! boilerplate_fft_simd_butterfly { return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here } - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { unsafe { // Specialization workaround: See the comments in FftPlannerAvx::new() for why we have to transmute these slices self.perform_fft_f32(workaround_transmute_mut::<_, Complex>(chunk)); @@ -188,7 +188,7 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { unsafe { array_utils::workaround_transmute_mut(input) }; let transmuted_output: &mut [Complex] = unsafe { array_utils::workaround_transmute_mut(output) }; - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( transmuted_input, transmuted_output, self.len(), @@ -216,7 +216,7 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { unsafe { array_utils::workaround_transmute_mut(buffer) }; let transmuted_scratch: &mut [Complex] = unsafe { array_utils::workaround_transmute_mut(scratch) }; - let result = array_utils::iter_chunks(transmuted_buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(transmuted_buffer, self.len(), |chunk| { self.perform_fft_inplace(chunk, transmuted_scratch) }); diff --git a/src/avx/avx64_butterflies.rs b/src/avx/avx64_butterflies.rs index c15d5459..28415934 100644 --- a/src/avx/avx64_butterflies.rs +++ b/src/avx/avx64_butterflies.rs @@ -47,7 +47,7 @@ macro_rules! boilerplate_fft_simd_butterfly { return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here } - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -77,7 +77,7 @@ macro_rules! boilerplate_fft_simd_butterfly { return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here } - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { unsafe { // Specialization workaround: See the comments in FftPlannerAvx::new() for why we have to transmute these slices self.perform_fft_f64(workaround_transmute_mut::<_, Complex>(chunk)); @@ -188,7 +188,7 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { unsafe { array_utils::workaround_transmute_mut(input) }; let transmuted_output: &mut [Complex] = unsafe { array_utils::workaround_transmute_mut(output) }; - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( transmuted_input, transmuted_output, self.len(), @@ -216,7 +216,7 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { unsafe { array_utils::workaround_transmute_mut(buffer) }; let transmuted_scratch: &mut [Complex] = unsafe { array_utils::workaround_transmute_mut(scratch) }; - let result = array_utils::iter_chunks(transmuted_buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(transmuted_buffer, self.len(), |chunk| { self.perform_fft_inplace(chunk, transmuted_scratch) }); diff --git a/src/avx/mod.rs b/src/avx/mod.rs index dc3dbeae..303d6c8e 100644 --- a/src/avx/mod.rs +++ b/src/avx/mod.rs @@ -53,7 +53,7 @@ macro_rules! boilerplate_avx_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -88,7 +88,7 @@ macro_rules! boilerplate_avx_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_inplace(chunk, scratch) }); @@ -157,7 +157,7 @@ macro_rules! boilerplate_avx_fft_commondata { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -196,7 +196,7 @@ macro_rules! boilerplate_avx_fft_commondata { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_inplace(chunk, scratch) }); diff --git a/src/common.rs b/src/common.rs index 008be6db..43e0facd 100644 --- a/src/common.rs +++ b/src/common.rs @@ -99,7 +99,7 @@ macro_rules! boilerplate_fft_oop { return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here } - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -132,7 +132,7 @@ macro_rules! boilerplate_fft_oop { } let (scratch, extra_scratch) = scratch.split_at_mut(self.len()); - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_out_of_place(chunk, scratch, extra_scratch); chunk.copy_from_slice(scratch); }); @@ -202,7 +202,7 @@ macro_rules! boilerplate_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -241,7 +241,7 @@ macro_rules! boilerplate_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_inplace(chunk, scratch) }); diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs index dc84ffa3..261b51d9 100644 --- a/src/neon/neon_butterflies.rs +++ b/src/neon/neon_butterflies.rs @@ -43,7 +43,7 @@ macro_rules! boilerplate_fft_neon_f32_butterfly { buffer: &mut [Complex], ) -> Result<(), ()> { let len = buffer.len(); - let alldone = array_utils::iter_chunks(buffer, 2 * self.len(), |chunk| { + let alldone = array_utils::iter_chunks_mut(buffer, 2 * self.len(), |chunk| { self.perform_parallel_fft_butterfly(chunk) }); if alldone.is_err() && buffer.len() >= self.len() { @@ -59,7 +59,7 @@ macro_rules! boilerplate_fft_neon_f32_butterfly { output: &mut [Complex], ) -> Result<(), ()> { let len = input.len(); - let alldone = array_utils::iter_chunks_zipped( + let alldone = array_utils::iter_chunks_zipped_mut( input, output, 2 * self.len(), @@ -101,7 +101,7 @@ macro_rules! boilerplate_fft_neon_f64_butterfly { &self, buffer: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks(buffer, self.len(), |chunk| { + array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_butterfly(chunk) }) } @@ -113,7 +113,7 @@ macro_rules! boilerplate_fft_neon_f64_butterfly { input: &mut [Complex], output: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| { + array_utils::iter_chunks_zipped_mut(input, output, self.len(), |in_chunk, out_chunk| { let input_slice = workaround_transmute_mut(in_chunk); let output_slice = workaround_transmute_mut(out_chunk); self.perform_fft_contiguous(DoubleBuf { diff --git a/src/neon/neon_common.rs b/src/neon/neon_common.rs index 826a320e..f48ceb97 100644 --- a/src/neon/neon_common.rs +++ b/src/neon/neon_common.rs @@ -74,7 +74,7 @@ macro_rules! boilerplate_fft_neon_oop { } let result = unsafe { - array_utils::iter_chunks_zipped( + array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -109,7 +109,7 @@ macro_rules! boilerplate_fft_neon_oop { let scratch = &mut scratch[..required_scratch]; let result = unsafe { - array_utils::iter_chunks(buffer, self.len(), |chunk| { + array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_out_of_place(chunk, scratch, &mut []); chunk.copy_from_slice(scratch); }) @@ -180,7 +180,7 @@ macro_rules! boilerplate_sse_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -219,7 +219,7 @@ macro_rules! boilerplate_sse_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_inplace(chunk, scratch) }); diff --git a/src/sse/sse_butterflies.rs b/src/sse/sse_butterflies.rs index 97c6bcfc..b83f6422 100644 --- a/src/sse/sse_butterflies.rs +++ b/src/sse/sse_butterflies.rs @@ -48,7 +48,7 @@ macro_rules! boilerplate_fft_sse_f32_butterfly { buffer: &mut [Complex], ) -> Result<(), ()> { let len = buffer.len(); - let alldone = array_utils::iter_chunks(buffer, 2 * self.len(), |chunk| { + let alldone = array_utils::iter_chunks_mut(buffer, 2 * self.len(), |chunk| { self.perform_parallel_fft_butterfly(chunk) }); if alldone.is_err() && buffer.len() >= self.len() { @@ -65,7 +65,7 @@ macro_rules! boilerplate_fft_sse_f32_butterfly { output: &mut [Complex], ) -> Result<(), ()> { let len = input.len(); - let alldone = array_utils::iter_chunks_zipped( + let alldone = array_utils::iter_chunks_zipped_mut( input, output, 2 * self.len(), @@ -107,7 +107,7 @@ macro_rules! boilerplate_fft_sse_f32_butterfly_noparallel { &self, buffer: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks(buffer, self.len(), |chunk| { + array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_butterfly(chunk) }) } @@ -119,7 +119,7 @@ macro_rules! boilerplate_fft_sse_f32_butterfly_noparallel { input: &mut [Complex], output: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| { + array_utils::iter_chunks_zipped_mut(input, output, self.len(), |in_chunk, out_chunk| { let input_slice = workaround_transmute_mut(in_chunk); let output_slice = workaround_transmute_mut(out_chunk); self.perform_fft_contiguous(DoubleBuf { @@ -147,7 +147,7 @@ macro_rules! boilerplate_fft_sse_f64_butterfly { &self, buffer: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks(buffer, self.len(), |chunk| { + array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_butterfly(chunk) }) } @@ -159,7 +159,7 @@ macro_rules! boilerplate_fft_sse_f64_butterfly { input: &mut [Complex], output: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| { + array_utils::iter_chunks_zipped_mut(input, output, self.len(), |in_chunk, out_chunk| { let input_slice = workaround_transmute_mut(in_chunk); let output_slice = workaround_transmute_mut(out_chunk); self.perform_fft_contiguous(DoubleBuf { diff --git a/src/sse/sse_common.rs b/src/sse/sse_common.rs index 0c319955..a6678720 100644 --- a/src/sse/sse_common.rs +++ b/src/sse/sse_common.rs @@ -74,7 +74,7 @@ macro_rules! boilerplate_fft_sse_oop { } let result = unsafe { - array_utils::iter_chunks_zipped( + array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -109,7 +109,7 @@ macro_rules! boilerplate_fft_sse_oop { let scratch = &mut scratch[..required_scratch]; let result = unsafe { - array_utils::iter_chunks(buffer, self.len(), |chunk| { + array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_out_of_place(chunk, scratch, &mut []); chunk.copy_from_slice(scratch); }) @@ -180,7 +180,7 @@ macro_rules! boilerplate_sse_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks_zipped( + let result = array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -219,7 +219,7 @@ macro_rules! boilerplate_sse_fft { } let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks(buffer, self.len(), |chunk| { + let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_inplace(chunk, scratch) }); diff --git a/src/wasm_simd/wasm_simd_butterflies.rs b/src/wasm_simd/wasm_simd_butterflies.rs index 90af2e8f..5b5ec874 100644 --- a/src/wasm_simd/wasm_simd_butterflies.rs +++ b/src/wasm_simd/wasm_simd_butterflies.rs @@ -46,7 +46,7 @@ macro_rules! boilerplate_fft_wasm_simd_f32_butterfly { buffer: &mut [Complex], ) -> Result<(), ()> { let len = buffer.len(); - let alldone = array_utils::iter_chunks(buffer, 2 * self.len(), |chunk| { + let alldone = array_utils::iter_chunks_mut(buffer, 2 * self.len(), |chunk| { self.perform_parallel_fft_butterfly(chunk) }); if alldone.is_err() && buffer.len() >= self.len() { @@ -63,7 +63,7 @@ macro_rules! boilerplate_fft_wasm_simd_f32_butterfly { output: &mut [Complex], ) -> Result<(), ()> { let len = input.len(); - let alldone = array_utils::iter_chunks_zipped( + let alldone = array_utils::iter_chunks_zipped_mut( input, output, 2 * self.len(), @@ -105,7 +105,7 @@ macro_rules! boilerplate_fft_wasm_simd_f64_butterfly { &self, buffer: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks(buffer, self.len(), |chunk| { + array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_butterfly(chunk) }) } @@ -117,7 +117,7 @@ macro_rules! boilerplate_fft_wasm_simd_f64_butterfly { input: &mut [Complex], output: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| { + array_utils::iter_chunks_zipped_mut(input, output, self.len(), |in_chunk, out_chunk| { let input_slice = workaround_transmute_mut(in_chunk); let output_slice = workaround_transmute_mut(out_chunk); self.perform_fft_contiguous(DoubleBuf { diff --git a/src/wasm_simd/wasm_simd_common.rs b/src/wasm_simd/wasm_simd_common.rs index 787170d9..4c7a20cf 100644 --- a/src/wasm_simd/wasm_simd_common.rs +++ b/src/wasm_simd/wasm_simd_common.rs @@ -74,7 +74,7 @@ macro_rules! boilerplate_fft_wasm_simd_oop { } let result = unsafe { - array_utils::iter_chunks_zipped( + array_utils::iter_chunks_zipped_mut( input, output, self.len(), @@ -109,7 +109,7 @@ macro_rules! boilerplate_fft_wasm_simd_oop { let scratch = &mut scratch[..required_scratch]; let result = unsafe { - array_utils::iter_chunks(buffer, self.len(), |chunk| { + array_utils::iter_chunks_mut(buffer, self.len(), |chunk| { self.perform_fft_out_of_place(chunk, scratch, &mut []); chunk.copy_from_slice(scratch); }) From dae4c82856824781b3c0e06f69bd732fd6dd8681 Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sat, 8 Mar 2025 22:37:44 -0600 Subject: [PATCH 02/10] Adding immutable api --- src/algorithm/butterflies.rs | 42 ++++++++++++++++ src/avx/avx32_butterflies.rs | 70 ++++++++++++++++++++++++++- src/avx/avx64_butterflies.rs | 69 ++++++++++++++++++++++++++- src/avx/mod.rs | 92 ++++++++++++++++++++++++++++++++++++ src/common.rs | 90 +++++++++++++++++++++++++++++++++++ src/lib.rs | 7 +++ src/test_utils.rs | 14 ++++++ 7 files changed, 382 insertions(+), 2 deletions(-) diff --git a/src/algorithm/butterflies.rs b/src/algorithm/butterflies.rs index 17db5170..f6617568 100644 --- a/src/algorithm/butterflies.rs +++ b/src/algorithm/butterflies.rs @@ -17,6 +17,39 @@ macro_rules! boilerplate_fft_butterfly { } } impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + _scratch: &mut [Complex], + ) { + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + unsafe { + self.perform_fft_butterfly(DoubleBuf { + input: in_chunk, + output: out_chunk, + }) + }; + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } + fn process_outofplace_with_scratch( &self, input: &mut [Complex], @@ -104,6 +137,15 @@ impl Butterfly1 { } } impl Fft for Butterfly1 { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + _scratch: &mut [Complex], + ) { + output.copy_from_slice(&input); + } + fn process_outofplace_with_scratch( &self, input: &mut [Complex], diff --git a/src/avx/avx32_butterflies.rs b/src/avx/avx32_butterflies.rs index 91f9def5..9f743f21 100644 --- a/src/avx/avx32_butterflies.rs +++ b/src/avx/avx32_butterflies.rs @@ -5,7 +5,7 @@ use std::mem::MaybeUninit; use num_complex::Complex; use crate::array_utils; -use crate::array_utils::workaround_transmute_mut; +use crate::array_utils::*; use crate::array_utils::DoubleBuf; use crate::common::{fft_error_inplace, fft_error_outofplace}; use crate::{common::FftNum, twiddles}; @@ -37,6 +37,42 @@ macro_rules! boilerplate_fft_simd_butterfly { } impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + _scratch: &mut [Complex], + ) { + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why we have to transmute these slices + let input_slice = workaround_transmute(in_chunk); + let output_slice = workaround_transmute_mut(out_chunk); + self.perform_fft_f32(DoubleBuf { + input: input_slice, + output: output_slice, + }); + } + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } + fn process_outofplace_with_scratch( &self, input: &mut [Complex], @@ -171,6 +207,38 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { } } impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_input: &[Complex] = + unsafe { array_utils::workaround_transmute(input) }; + let transmuted_output: &mut [Complex] = + unsafe { array_utils::workaround_transmute_mut(output) }; + let transmuted_scratch: &mut [Complex] = unsafe {array_utils::workaround_transmute_mut(scratch)}; + let result = array_utils::iter_chunks_zipped( + transmuted_input, + transmuted_output, + self.len(), + |in_chunk, out_chunk| self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, transmuted_scratch), + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } + fn process_outofplace_with_scratch( &self, input: &mut [Complex], diff --git a/src/avx/avx64_butterflies.rs b/src/avx/avx64_butterflies.rs index 28415934..e4934790 100644 --- a/src/avx/avx64_butterflies.rs +++ b/src/avx/avx64_butterflies.rs @@ -5,7 +5,7 @@ use std::mem::MaybeUninit; use num_complex::Complex; use crate::array_utils; -use crate::array_utils::workaround_transmute_mut; +use crate::array_utils::*; use crate::array_utils::DoubleBuf; use crate::common::{fft_error_inplace, fft_error_outofplace}; use crate::{common::FftNum, twiddles}; @@ -35,6 +35,41 @@ macro_rules! boilerplate_fft_simd_butterfly { } } impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why we have to transmute these slices + let input_slice = workaround_transmute(in_chunk); + let output_slice = workaround_transmute_mut(out_chunk); + self.perform_fft_f64(DoubleBuf { + input: input_slice, + output: output_slice, + }); + } + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], @@ -171,6 +206,38 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { } } impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_input: &[Complex] = + unsafe { array_utils::workaround_transmute(input) }; + let transmuted_output: &mut [Complex] = + unsafe { array_utils::workaround_transmute_mut(output) }; + let transmuted_scratch: &mut [Complex] = + unsafe { array_utils::workaround_transmute_mut(scratch) }; + let result = array_utils::iter_chunks_zipped( + transmuted_input, + transmuted_output, + self.len(), + |in_chunk, out_chunk| self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, transmuted_scratch), + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], diff --git a/src/avx/mod.rs b/src/avx/mod.rs index 303d6c8e..38c26fc1 100644 --- a/src/avx/mod.rs +++ b/src/avx/mod.rs @@ -30,6 +30,50 @@ struct CommonSimdData { macro_rules! boilerplate_avx_fft { ($struct_name:ident, $len_fn:expr, $inplace_scratch_len_fn:expr, $out_of_place_scratch_len_fn:expr) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + let required_scratch = self.get_outofplace_scratch_len(); + if scratch.len() < required_scratch + || input.len() < self.len() + || output.len() != input.len() + { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let scratch = &mut scratch[..required_scratch]; + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ) + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], @@ -130,6 +174,54 @@ macro_rules! boilerplate_avx_fft { macro_rules! boilerplate_avx_fft_commondata { ($struct_name:ident) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_outofplace_scratch_len(); + if scratch.len() < required_scratch + || input.len() < self.len() + || output.len() != input.len() + { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let scratch = &mut scratch[..required_scratch]; + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], diff --git a/src/common.rs b/src/common.rs index 43e0facd..85cef334 100644 --- a/src/common.rs +++ b/src/common.rs @@ -73,6 +73,47 @@ pub fn fft_error_outofplace( macro_rules! boilerplate_fft_oop { ($struct_name:ident, $len_fn:expr) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_outofplace_scratch_len(); + if input.len() < self.len() + || output.len() != input.len() + || scratch.len() < required_scratch + { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + required_scratch, + scratch.len(), + ); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], @@ -175,6 +216,55 @@ macro_rules! boilerplate_fft_oop { macro_rules! boilerplate_fft { ($struct_name:ident, $len_fn:expr, $inplace_scratch_len_fn:expr, $out_of_place_scratch_len_fn:expr) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_outofplace_scratch_len(); + if scratch.len() < required_scratch + || input.len() < self.len() + || output.len() != input.len() + { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let scratch = &mut scratch[..required_scratch]; + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + } + } + fn process_outofplace_with_scratch( &self, input: &mut [Complex], diff --git a/src/lib.rs b/src/lib.rs index 74b47876..f4d947eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -236,6 +236,13 @@ pub trait Fft: Length + Direction + Sync + Send { scratch: &mut [Complex], ); + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ); + /// Returns the size of the scratch buffer required by `process_with_scratch` /// /// For most FFT sizes, this method will return `self.len()`. For a few small sizes it will return 0, and for some special FFT sizes diff --git a/src/test_utils.rs b/src/test_utils.rs index 1bbb48bd..4a60f197 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -213,6 +213,20 @@ impl Fft for BigScratchAlgorithm { fn get_outofplace_scratch_len(&self) -> usize { self.outofplace_scratch } + + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + assert!( + scratch.len() >= self.outofplace_scratch, + "Not enough OOP scratch provided, self={:?}, provided scratch={}", + &self, + scratch.len() + ); + } } impl Length for BigScratchAlgorithm { fn len(&self) -> usize { From 346a0d94f4d6f1a8a57be697a8dba780c11f5b84 Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 12:29:47 -0500 Subject: [PATCH 03/10] Adding neon --- Cargo.toml | 2 +- src/neon/neon_butterflies.rs | 35 +++++++++++++++++++++++++++-------- src/neon/neon_common.rs | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 74b1a875..8b637e86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ categories = ["algorithms", "compression", "multimedia::encoding", "science"] license = "MIT OR Apache-2.0" [features] -default = ["avx"] +default = ["avx","neon"] # On x86_64, the "avx" feature enables compilation of AVX-acclerated code. # Similarly, the "sse" feature enables compilation of SSE-accelerated code. diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs index 261b51d9..36317a4e 100644 --- a/src/neon/neon_butterflies.rs +++ b/src/neon/neon_butterflies.rs @@ -55,16 +55,16 @@ macro_rules! boilerplate_fft_neon_f32_butterfly { // Do multiple ffts over a longer vector outofplace, called from "process_outofplace_with_scratch" of Fft trait pub(crate) unsafe fn perform_oop_fft_butterfly_multi( &self, - input: &mut [Complex], + input: &[Complex], output: &mut [Complex], ) -> Result<(), ()> { let len = input.len(); - let alldone = array_utils::iter_chunks_zipped_mut( + let alldone = array_utils::iter_chunks_zipped( input, output, 2 * self.len(), |in_chunk, out_chunk| { - let input_slice = workaround_transmute_mut(in_chunk); + let input_slice = crate::array_utils::workaround_transmute(in_chunk); let output_slice = workaround_transmute_mut(out_chunk); self.perform_parallel_fft_contiguous(DoubleBuf { input: input_slice, @@ -73,10 +73,10 @@ macro_rules! boilerplate_fft_neon_f32_butterfly { }, ); if alldone.is_err() && input.len() >= self.len() { - let input_slice = workaround_transmute_mut(input); + let input_slice = crate::array_utils::workaround_transmute(input); let output_slice = workaround_transmute_mut(output); self.perform_fft_contiguous(DoubleBuf { - input: &mut input_slice[len - self.len()..], + input: &input_slice[len - self.len()..], output: &mut output_slice[len - self.len()..], }) } @@ -110,11 +110,11 @@ macro_rules! boilerplate_fft_neon_f64_butterfly { //#[target_feature(enable = "neon")] pub(crate) unsafe fn perform_oop_fft_butterfly_multi( &self, - input: &mut [Complex], + input: &[Complex], output: &mut [Complex], ) -> Result<(), ()> { - array_utils::iter_chunks_zipped_mut(input, output, self.len(), |in_chunk, out_chunk| { - let input_slice = workaround_transmute_mut(in_chunk); + array_utils::iter_chunks_zipped(input, output, self.len(), |in_chunk, out_chunk| { + let input_slice = crate::array_utils::workaround_transmute(in_chunk); let output_slice = workaround_transmute_mut(out_chunk); self.perform_fft_contiguous(DoubleBuf { input: input_slice, @@ -130,6 +130,25 @@ macro_rules! boilerplate_fft_neon_f64_butterfly { macro_rules! boilerplate_fft_neon_common_butterfly { ($struct_name:ident, $len:expr, $direction_fn:expr) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + _scratch: &mut [Complex], + ) { + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + let result = unsafe { self.perform_oop_fft_butterfly_multi(input, output) }; + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], diff --git a/src/neon/neon_common.rs b/src/neon/neon_common.rs index f48ceb97..f0d3abea 100644 --- a/src/neon/neon_common.rs +++ b/src/neon/neon_common.rs @@ -57,6 +57,39 @@ macro_rules! separate_interleaved_complex_f32 { macro_rules! boilerplate_fft_neon_oop { ($struct_name:ident, $len_fn:expr) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + _scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let result = unsafe { + array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.perform_fft_out_of_place(in_chunk, out_chunk, &mut []) + }, + ) + }; + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], From b37d08007b0b08e224d6ee8f27b8907007295e45 Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 14:44:05 -0500 Subject: [PATCH 04/10] Adding mixed radix implementation --- src/algorithm/bluesteins_algorithm.rs | 9 ++++ src/algorithm/dft.rs | 9 ++++ src/algorithm/good_thomas_algorithm.rs | 18 +++++++ src/algorithm/mixed_radix.rs | 67 ++++++++++++++++++++++++++ src/algorithm/raders_algorithm.rs | 9 ++++ src/algorithm/radix3.rs | 9 ++++ src/algorithm/radix4.rs | 9 ++++ src/algorithm/radixn.rs | 9 ++++ src/avx/avx32_butterflies.rs | 9 ++++ src/avx/avx64_butterflies.rs | 9 ++++ src/avx/avx_bluesteins.rs | 9 ++++ src/avx/avx_mixed_radix.rs | 9 ++++ src/avx/avx_raders.rs | 9 ++++ src/common.rs | 7 ++- src/lib.rs | 11 +++++ src/neon/neon_radix4.rs | 9 ++++ tests/test_basic.rs | 15 ++++++ 17 files changed, 224 insertions(+), 2 deletions(-) create mode 100644 tests/test_basic.rs diff --git a/src/algorithm/bluesteins_algorithm.rs b/src/algorithm/bluesteins_algorithm.rs index 9cc3642c..a16b6703 100644 --- a/src/algorithm/bluesteins_algorithm.rs +++ b/src/algorithm/bluesteins_algorithm.rs @@ -137,6 +137,15 @@ impl BluesteinsAlgorithm { } } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/algorithm/dft.rs b/src/algorithm/dft.rs index e0b700cf..0e956fd1 100644 --- a/src/algorithm/dft.rs +++ b/src/algorithm/dft.rs @@ -68,6 +68,15 @@ impl Dft { } } } + + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } } boilerplate_fft_oop!(Dft, |this: &Dft<_>| this.twiddles.len()); diff --git a/src/algorithm/good_thomas_algorithm.rs b/src/algorithm/good_thomas_algorithm.rs index 005ba529..b28cff5e 100644 --- a/src/algorithm/good_thomas_algorithm.rs +++ b/src/algorithm/good_thomas_algorithm.rs @@ -241,6 +241,15 @@ impl GoodThomasAlgorithm { self.reindex_output(scratch, buffer); } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], @@ -384,6 +393,15 @@ impl GoodThomasAlgorithmSmall { } } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/algorithm/mixed_radix.rs b/src/algorithm/mixed_radix.rs index 808ff0fa..ac496a65 100644 --- a/src/algorithm/mixed_radix.rs +++ b/src/algorithm/mixed_radix.rs @@ -151,6 +151,45 @@ impl MixedRadix { transpose::transpose(scratch, buffer, self.width, self.height); } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + // STEP 1: transpose + transpose::transpose(input, output, self.width, self.height); + + // STEP 2: perform FFTs of size `height` + // let height_scratch = if scratch.len() > input.len() { + // &mut scratch[..] + // } else { + // &mut input[..] + // }; + self.height_size_fft + .process_with_scratch(output, scratch); + + // STEP 3: Apply twiddle factors + for (element, twiddle) in output.iter_mut().zip(self.twiddles.iter()) { + *element = *element * twiddle; + } + + // STEP 4: transpose again + transpose::transpose(output, scratch, self.height, self.width); + + // STEP 5: perform FFTs of size `width` + // let width_scratch = if scratch.len() > output.len() { + // &mut scratch[..] + // } else { + // &mut output[..] + // }; + self.width_size_fft + .process_with_scratch(scratch, output); + + // STEP 6: transpose again + transpose::transpose(scratch, output, self.width, self.height); + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], @@ -302,6 +341,34 @@ impl MixedRadixSmall { unsafe { array_utils::transpose_small(self.width, self.height, scratch, buffer) }; } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + // SIX STEP FFT: + // STEP 1: transpose + unsafe { array_utils::transpose_small(self.width, self.height, input, output) }; + + // STEP 2: perform FFTs of size `height` + self.height_size_fft.process_with_scratch(output, scratch); + + // STEP 3: Apply twiddle factors + for (element, twiddle) in output.iter_mut().zip(self.twiddles.iter()) { + *element = *element * twiddle; + } + + // STEP 4: transpose again + unsafe { array_utils::transpose_small(self.height, self.width, output, scratch) }; + + // STEP 5: perform FFTs of size `width` + self.width_size_fft.process_with_scratch(scratch, output); + + // STEP 6: transpose again + unsafe { array_utils::transpose_small(self.width, self.height, scratch, output) }; + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/algorithm/raders_algorithm.rs b/src/algorithm/raders_algorithm.rs index 7f059dd0..c9fa76c8 100644 --- a/src/algorithm/raders_algorithm.rs +++ b/src/algorithm/raders_algorithm.rs @@ -119,6 +119,15 @@ impl RadersAlgorithm { } } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/algorithm/radix3.rs b/src/algorithm/radix3.rs index d392f8c1..358f8586 100644 --- a/src/algorithm/radix3.rs +++ b/src/algorithm/radix3.rs @@ -118,6 +118,15 @@ impl Radix3 { self.outofplace_scratch_len } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/algorithm/radix4.rs b/src/algorithm/radix4.rs index 33a804e4..ffc998ae 100644 --- a/src/algorithm/radix4.rs +++ b/src/algorithm/radix4.rs @@ -124,6 +124,15 @@ impl Radix4 { self.outofplace_scratch_len } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/algorithm/radixn.rs b/src/algorithm/radixn.rs index bd4d35ca..40503298 100644 --- a/src/algorithm/radixn.rs +++ b/src/algorithm/radixn.rs @@ -174,6 +174,15 @@ impl RadixN { self.outofplace_scratch_len } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/avx/avx32_butterflies.rs b/src/avx/avx32_butterflies.rs index 9f743f21..708b11ac 100644 --- a/src/avx/avx32_butterflies.rs +++ b/src/avx/avx32_butterflies.rs @@ -191,6 +191,15 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { }; } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + #[inline] fn perform_fft_out_of_place( &self, diff --git a/src/avx/avx64_butterflies.rs b/src/avx/avx64_butterflies.rs index e4934790..3e3316c2 100644 --- a/src/avx/avx64_butterflies.rs +++ b/src/avx/avx64_butterflies.rs @@ -190,6 +190,15 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { }; } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + #[inline] fn perform_fft_out_of_place( &self, diff --git a/src/avx/avx_bluesteins.rs b/src/avx/avx_bluesteins.rs index 30e8f8e1..e9757e01 100644 --- a/src/avx/avx_bluesteins.rs +++ b/src/avx/avx_bluesteins.rs @@ -311,6 +311,15 @@ impl BluesteinsAvx { } } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &[Complex], diff --git a/src/avx/avx_mixed_radix.rs b/src/avx/avx_mixed_radix.rs index ed6de0fc..42b283be 100644 --- a/src/avx/avx_mixed_radix.rs +++ b/src/avx/avx_mixed_radix.rs @@ -69,6 +69,15 @@ macro_rules! boilerplate_mixedradix { } } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + #[inline] fn perform_fft_out_of_place( &self, diff --git a/src/avx/avx_raders.rs b/src/avx/avx_raders.rs index 34b76b1d..e3e0cfc7 100644 --- a/src/avx/avx_raders.rs +++ b/src/avx/avx_raders.rs @@ -351,6 +351,15 @@ impl RadersAvx2 { } } + fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + fn perform_fft_out_of_place( &self, input: &mut [Complex], diff --git a/src/common.rs b/src/common.rs index 85cef334..b082a62a 100644 --- a/src/common.rs +++ b/src/common.rs @@ -226,7 +226,10 @@ macro_rules! boilerplate_fft { return; } - let required_scratch = self.get_outofplace_scratch_len(); + // Normally the required scratch would just be `self.self.get_outofplace_scratch_len()` + // However, small FFTs could have a required scratch of 0. Use the max of the scratch + // and the length for that case + let required_scratch = usize::max(self.get_outofplace_scratch_len(), self.len()); if scratch.len() < required_scratch || input.len() < self.len() || output.len() != input.len() @@ -248,7 +251,7 @@ macro_rules! boilerplate_fft { output, self.len(), |in_chunk, out_chunk| { - self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) + self.perform_fft_out_of_place_immut(in_chunk, out_chunk, scratch) }, ); diff --git a/src/lib.rs b/src/lib.rs index f4d947eb..6e853ef5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -243,6 +243,17 @@ pub trait Fft: Length + Direction + Sync + Send { scratch: &mut [Complex], ); + fn process_outofplace_with_scratch_naive( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + let (mut input_scratch, scratch) = scratch.split_at_mut(input.len()); + input_scratch.copy_from_slice(input); + self.process_outofplace_with_scratch(&mut input_scratch, output, scratch); + } + /// Returns the size of the scratch buffer required by `process_with_scratch` /// /// For most FFT sizes, this method will return `self.len()`. For a few small sizes it will return 0, and for some special FFT sizes diff --git a/src/neon/neon_radix4.rs b/src/neon/neon_radix4.rs index 841a782c..aeff1a4c 100644 --- a/src/neon/neon_radix4.rs +++ b/src/neon/neon_radix4.rs @@ -83,6 +83,15 @@ impl NeonRadix4 { } } + unsafe fn perform_fft_out_of_place_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + todo!() + } + //#[target_feature(enable = "neon")] unsafe fn perform_fft_out_of_place( &self, diff --git a/tests/test_basic.rs b/tests/test_basic.rs new file mode 100644 index 00000000..8018c9a5 --- /dev/null +++ b/tests/test_basic.rs @@ -0,0 +1,15 @@ +use num_complex::Complex; +use rustfft::{Fft, FftPlanner}; +use std::sync::Arc; + +// Just a very simple test to help debugging +#[test] +fn test_100() { + let mut p = FftPlanner::::new(); + let planner: Arc> = p.plan_fft_forward(100); + + let mut input: Vec<_> = (0..100).map(|i| Complex::new(i as f32, 0.0)).collect(); + let mut output = input.clone(); + let mut scratch = input.clone(); + planner.process_outofplace_with_scratch(&mut input, &mut output, &mut scratch); +} From 2f346b4686ed4e2479abb81bb1550a76ed69a19e Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 14:56:02 -0500 Subject: [PATCH 05/10] Adding implementation for bluesteins and good thomas --- src/algorithm/bluesteins_algorithm.rs | 5 ++++- src/algorithm/good_thomas_algorithm.rs | 25 ++++++++++++++++++++++++- tests/test_basic.rs | 23 +++++++++++++++++------ 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/src/algorithm/bluesteins_algorithm.rs b/src/algorithm/bluesteins_algorithm.rs index a16b6703..deca84af 100644 --- a/src/algorithm/bluesteins_algorithm.rs +++ b/src/algorithm/bluesteins_algorithm.rs @@ -143,7 +143,10 @@ impl BluesteinsAlgorithm { output: &mut [Complex], scratch: &mut [Complex], ) { - todo!() + // TODO - Is there a better way to do this? + let (mut input_scratch, scratch) = scratch.split_at_mut(input.len()); + input_scratch.copy_from_slice(input); + self.process_outofplace_with_scratch(&mut input_scratch, output, scratch); } fn perform_fft_out_of_place( diff --git a/src/algorithm/good_thomas_algorithm.rs b/src/algorithm/good_thomas_algorithm.rs index b28cff5e..e01328f4 100644 --- a/src/algorithm/good_thomas_algorithm.rs +++ b/src/algorithm/good_thomas_algorithm.rs @@ -399,7 +399,30 @@ impl GoodThomasAlgorithmSmall { output: &mut [Complex], scratch: &mut [Complex], ) { - todo!() + // These asserts are for the unsafe blocks down below. we're relying on the optimizer to get rid of this assert + assert_eq!(self.len(), input.len()); + assert_eq!(self.len(), output.len()); + + let (input_map, output_map) = self.input_output_map.split_at(self.len()); + + // copy the input using our reordering mapping + for (output_element, &input_index) in output.iter_mut().zip(input_map.iter()) { + *output_element = input[input_index]; + } + + // run FFTs of size `width` + self.width_size_fft.process_with_scratch(output, scratch); + + // transpose + unsafe { array_utils::transpose_small(self.width, self.height, output, scratch) }; + + // run FFTs of size 'height' + self.height_size_fft.process_with_scratch(scratch, output); + + // copy to the output, using our output redordeing mapping + for (input_element, &output_index) in scratch.iter().zip(output_map.iter()) { + output[output_index] = *input_element; + } } fn perform_fft_out_of_place( diff --git a/tests/test_basic.rs b/tests/test_basic.rs index 8018c9a5..cf06e3f3 100644 --- a/tests/test_basic.rs +++ b/tests/test_basic.rs @@ -5,11 +5,22 @@ use std::sync::Arc; // Just a very simple test to help debugging #[test] fn test_100() { - let mut p = FftPlanner::::new(); - let planner: Arc> = p.plan_fft_forward(100); + // let len = 102; + for len in 1..250 { + dbg!(len); + let mut p = FftPlanner::::new(); + let planner: Arc> = p.plan_fft_forward(len); - let mut input: Vec<_> = (0..100).map(|i| Complex::new(i as f32, 0.0)).collect(); - let mut output = input.clone(); - let mut scratch = input.clone(); - planner.process_outofplace_with_scratch(&mut input, &mut output, &mut scratch); + let mut input: Vec<_> = (0..len).map(|i| Complex::new(i as f32, 0.0)).collect(); + dbg!(planner.get_inplace_scratch_len(), planner.get_outofplace_scratch_len()); + let mut output = input.clone(); + let mut output2 = output.clone(); + let mut scratch = vec![Complex::::ZERO; planner.get_outofplace_scratch_len() + len]; + // planner.process_outofplace_with_scratch(&mut input, &mut output, &mut scratch); + planner.process_outofplace_with_scratch_immut(&input, &mut output, &mut scratch); + + planner.process_outofplace_with_scratch(&mut input, &mut output2, &mut scratch); + + assert_eq!(output, output2); + } } From 7eb265341924dcac730977e771651e59fb588a33 Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 15:56:41 -0500 Subject: [PATCH 06/10] Removing overridden impl --- src/common.rs | 92 --------------------------------------------------- src/lib.rs | 7 ---- 2 files changed, 99 deletions(-) diff --git a/src/common.rs b/src/common.rs index b082a62a..cc228acf 100644 --- a/src/common.rs +++ b/src/common.rs @@ -73,47 +73,6 @@ pub fn fft_error_outofplace( macro_rules! boilerplate_fft_oop { ($struct_name:ident, $len_fn:expr) => { impl Fft for $struct_name { - fn process_outofplace_with_scratch_immut( - &self, - input: &[Complex], - output: &mut [Complex], - scratch: &mut [Complex], - ) { - if self.len() == 0 { - return; - } - - let required_scratch = self.get_outofplace_scratch_len(); - if input.len() < self.len() - || output.len() != input.len() - || scratch.len() < required_scratch - { - // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace( - self.len(), - input.len(), - output.len(), - required_scratch, - scratch.len(), - ); - return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here - } - - let result = array_utils::iter_chunks_zipped( - input, - output, - self.len(), - |in_chunk, out_chunk| { - self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) - }, - ); - - if result.is_err() { - // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, - // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); - } - } fn process_outofplace_with_scratch( &self, input: &mut [Complex], @@ -216,57 +175,6 @@ macro_rules! boilerplate_fft_oop { macro_rules! boilerplate_fft { ($struct_name:ident, $len_fn:expr, $inplace_scratch_len_fn:expr, $out_of_place_scratch_len_fn:expr) => { impl Fft for $struct_name { - fn process_outofplace_with_scratch_immut( - &self, - input: &[Complex], - output: &mut [Complex], - scratch: &mut [Complex], - ) { - if self.len() == 0 { - return; - } - - // Normally the required scratch would just be `self.self.get_outofplace_scratch_len()` - // However, small FFTs could have a required scratch of 0. Use the max of the scratch - // and the length for that case - let required_scratch = usize::max(self.get_outofplace_scratch_len(), self.len()); - if scratch.len() < required_scratch - || input.len() < self.len() - || output.len() != input.len() - { - // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace( - self.len(), - input.len(), - output.len(), - self.get_outofplace_scratch_len(), - scratch.len(), - ); - return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here - } - - let scratch = &mut scratch[..required_scratch]; - let result = array_utils::iter_chunks_zipped( - input, - output, - self.len(), - |in_chunk, out_chunk| { - self.perform_fft_out_of_place_immut(in_chunk, out_chunk, scratch) - }, - ); - - if result.is_err() { - // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, - // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace( - self.len(), - input.len(), - output.len(), - self.get_outofplace_scratch_len(), - scratch.len(), - ); - } - } fn process_outofplace_with_scratch( &self, diff --git a/src/lib.rs b/src/lib.rs index 6e853ef5..c2615b20 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -241,13 +241,6 @@ pub trait Fft: Length + Direction + Sync + Send { input: &[Complex], output: &mut [Complex], scratch: &mut [Complex], - ); - - fn process_outofplace_with_scratch_naive( - &self, - input: &[Complex], - output: &mut [Complex], - scratch: &mut [Complex], ) { let (mut input_scratch, scratch) = scratch.split_at_mut(input.len()); input_scratch.copy_from_slice(input); From edad1ca74eb6ff489aeca072908978b67d6e20f7 Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 17:36:22 -0500 Subject: [PATCH 07/10] Adding more AVX instructions --- src/avx/avx32_butterflies.rs | 25 +-------------- src/avx/avx64_butterflies.rs | 55 ++------------------------------ src/avx/avx_mixed_radix.rs | 33 ++++++++++++++++++- src/avx/avx_raders.rs | 62 +++++++++++++++++++++++++++++++++++- src/avx/mod.rs | 12 +++---- tests/test_basic.rs | 4 +-- 6 files changed, 104 insertions(+), 87 deletions(-) diff --git a/src/avx/avx32_butterflies.rs b/src/avx/avx32_butterflies.rs index 708b11ac..e9dadf78 100644 --- a/src/avx/avx32_butterflies.rs +++ b/src/avx/avx32_butterflies.rs @@ -222,30 +222,7 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { output: &mut [Complex], scratch: &mut [Complex], ) { - if input.len() < self.len() || output.len() != input.len() { - // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); - return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here - } - - // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary - let transmuted_input: &[Complex] = - unsafe { array_utils::workaround_transmute(input) }; - let transmuted_output: &mut [Complex] = - unsafe { array_utils::workaround_transmute_mut(output) }; - let transmuted_scratch: &mut [Complex] = unsafe {array_utils::workaround_transmute_mut(scratch)}; - let result = array_utils::iter_chunks_zipped( - transmuted_input, - transmuted_output, - self.len(), - |in_chunk, out_chunk| self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, transmuted_scratch), - ); - - if result.is_err() { - // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, - // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); - } + todo!() } fn process_outofplace_with_scratch( diff --git a/src/avx/avx64_butterflies.rs b/src/avx/avx64_butterflies.rs index 3e3316c2..57846fc6 100644 --- a/src/avx/avx64_butterflies.rs +++ b/src/avx/avx64_butterflies.rs @@ -41,34 +41,7 @@ macro_rules! boilerplate_fft_simd_butterfly { output: &mut [Complex], scratch: &mut [Complex], ) { - if input.len() < self.len() || output.len() != input.len() { - // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); - return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here - } - - let result = array_utils::iter_chunks_zipped( - input, - output, - self.len(), - |in_chunk, out_chunk| { - unsafe { - // Specialization workaround: See the comments in FftPlannerAvx::new() for why we have to transmute these slices - let input_slice = workaround_transmute(in_chunk); - let output_slice = workaround_transmute_mut(out_chunk); - self.perform_fft_f64(DoubleBuf { - input: input_slice, - output: output_slice, - }); - } - }, - ); - - if result.is_err() { - // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, - // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); - } + todo!() } fn process_outofplace_with_scratch( &self, @@ -221,31 +194,7 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { output: &mut [Complex], scratch: &mut [Complex], ) { - if input.len() < self.len() || output.len() != input.len() { - // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); - return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here - } - - // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary - let transmuted_input: &[Complex] = - unsafe { array_utils::workaround_transmute(input) }; - let transmuted_output: &mut [Complex] = - unsafe { array_utils::workaround_transmute_mut(output) }; - let transmuted_scratch: &mut [Complex] = - unsafe { array_utils::workaround_transmute_mut(scratch) }; - let result = array_utils::iter_chunks_zipped( - transmuted_input, - transmuted_output, - self.len(), - |in_chunk, out_chunk| self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, transmuted_scratch), - ); - - if result.is_err() { - // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, - // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us - fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); - } + todo!() } fn process_outofplace_with_scratch( &self, diff --git a/src/avx/avx_mixed_radix.rs b/src/avx/avx_mixed_radix.rs index 42b283be..982faa40 100644 --- a/src/avx/avx_mixed_radix.rs +++ b/src/avx/avx_mixed_radix.rs @@ -75,7 +75,38 @@ macro_rules! boilerplate_mixedradix { output: &mut [Complex], scratch: &mut [Complex], ) { - todo!() + // Perform the column FFTs + // Safety: self.perform_column_butterflies() requires the "avx" and "fma" instruction sets, and we return Err() in our constructor if the instructions aren't available + scratch.copy_from_slice(input); + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_output: &mut [Complex] = + array_utils::workaround_transmute_mut(scratch); + + self.perform_column_butterflies(transmuted_output); + } + + // process the row FFTs. If extra scratch was provided, pass it in. Otherwise, use the output. + // let inner_scratch = if scratch.len() > 0 { + // scratch + // } else { + // &mut output[..] + // }; + self.common_data + .inner_fft + .process_with_scratch(scratch, output); + + // Transpose + // Safety: self.transpose() requires the "avx" instruction set, and we return Err() in our constructor if the instructions aren't available + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_input: &[Complex] = + array_utils::workaround_transmute(scratch); + let transmuted_output: &mut [Complex] = + array_utils::workaround_transmute_mut(output); + + self.transpose(transmuted_input, transmuted_output) + } } #[inline] diff --git a/src/avx/avx_raders.rs b/src/avx/avx_raders.rs index e3e0cfc7..e2d0510d 100644 --- a/src/avx/avx_raders.rs +++ b/src/avx/avx_raders.rs @@ -357,7 +357,67 @@ impl RadersAvx2 { output: &mut [Complex], scratch: &mut [Complex], ) { - todo!() + scratch.copy_from_slice(input); + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_input: &mut [Complex] = array_utils::workaround_transmute_mut(scratch); + let transmuted_output: &mut [Complex] = + array_utils::workaround_transmute_mut(output); + self.prepare_raders(transmuted_input, transmuted_output) + } + + let (first_input, inner_input) = scratch.split_first_mut().unwrap(); + let (first_output, inner_output) = output.split_first_mut().unwrap(); + + // perform the first of two inner FFTs + // let inner_scratch = if scratch.len() > 0 { + // &mut scratch[..] + // } else { + // &mut inner_input[..] + // }; + self.inner_fft + .process_with_scratch(inner_output, inner_input); + + // inner_output[0] now contains the sum of elements 1..n. we want the sum of all inputs, so all we need to do is add the first input + *first_output = inner_output[0] + *first_input; + + // multiply the inner result with our cached setup data + // also conjugate every entry. this sets us up to do an inverse FFT + // (because an inverse FFT is equivalent to a normal FFT where you conjugate both the inputs and outputs) + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_inner_input: &mut [Complex] = + array_utils::workaround_transmute_mut(inner_input); + let transmuted_inner_output: &mut [Complex] = + array_utils::workaround_transmute_mut(inner_output); + avx_vector::pairwise_complex_mul_conjugated( + transmuted_inner_output, + transmuted_inner_input, + &self.twiddles, + ) + }; + + // We need to add the first input value to all output values. We can accomplish this by adding it to the DC input of our inner ifft. + // Of course, we have to conjugate it, just like we conjugated the complex multiplied above + inner_input[0] = inner_input[0] + first_input.conj(); + + // execute the second FFT + // let inner_scratch = if scratch.len() > 0 { + // scratch + // } else { + // &mut inner_output[..] + // }; + self.inner_fft + .process_with_scratch(inner_input, inner_output); + + // copy the final values into the output, reordering as we go + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_input: &mut [Complex] = array_utils::workaround_transmute_mut(scratch); + let transmuted_output: &mut [Complex] = + array_utils::workaround_transmute_mut(output); + self.finalize_raders(transmuted_input, transmuted_output); + } } fn perform_fft_out_of_place( diff --git a/src/avx/mod.rs b/src/avx/mod.rs index 38c26fc1..f3a54699 100644 --- a/src/avx/mod.rs +++ b/src/avx/mod.rs @@ -36,7 +36,7 @@ macro_rules! boilerplate_avx_fft { output: &mut [Complex], scratch: &mut [Complex], ) { - let required_scratch = self.get_outofplace_scratch_len(); + let required_scratch = self.get_inplace_scratch_len(); if scratch.len() < required_scratch || input.len() < self.len() || output.len() != input.len() @@ -46,7 +46,7 @@ macro_rules! boilerplate_avx_fft { self.len(), input.len(), output.len(), - self.get_outofplace_scratch_len(), + self.get_inplace_scratch_len(), scratch.len(), ); return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here @@ -58,7 +58,7 @@ macro_rules! boilerplate_avx_fft { output, self.len(), |in_chunk, out_chunk| { - self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) + self.perform_fft_out_of_place_immut(in_chunk, out_chunk, scratch) }, ); @@ -184,7 +184,7 @@ macro_rules! boilerplate_avx_fft_commondata { return; } - let required_scratch = self.get_outofplace_scratch_len(); + let required_scratch = self.get_inplace_scratch_len(); if scratch.len() < required_scratch || input.len() < self.len() || output.len() != input.len() @@ -194,7 +194,7 @@ macro_rules! boilerplate_avx_fft_commondata { self.len(), input.len(), output.len(), - self.get_outofplace_scratch_len(), + self.get_inplace_scratch_len(), scratch.len(), ); return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here @@ -206,7 +206,7 @@ macro_rules! boilerplate_avx_fft_commondata { output, self.len(), |in_chunk, out_chunk| { - self.process_outofplace_with_scratch_immut(in_chunk, out_chunk, scratch) + self.perform_fft_out_of_place_immut(in_chunk, out_chunk, scratch) }, ); diff --git a/tests/test_basic.rs b/tests/test_basic.rs index cf06e3f3..abf76772 100644 --- a/tests/test_basic.rs +++ b/tests/test_basic.rs @@ -6,13 +6,13 @@ use std::sync::Arc; #[test] fn test_100() { // let len = 102; - for len in 1..250 { + for len in 37..250 { dbg!(len); let mut p = FftPlanner::::new(); let planner: Arc> = p.plan_fft_forward(len); let mut input: Vec<_> = (0..len).map(|i| Complex::new(i as f32, 0.0)).collect(); - dbg!(planner.get_inplace_scratch_len(), planner.get_outofplace_scratch_len()); + // dbg!(planner.get_inplace_scratch_len(), planner.get_outofplace_scratch_len()); let mut output = input.clone(); let mut output2 = output.clone(); let mut scratch = vec![Complex::::ZERO; planner.get_outofplace_scratch_len() + len]; From f2a9ca6df8363130605609abbc8da57e49d5d16a Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 17:59:54 -0500 Subject: [PATCH 08/10] Working avx --- src/avx/avx32_butterflies.rs | 40 ++++++++++++++++++++++++++---- src/avx/avx_bluesteins.rs | 48 +++++++++++++++++++++++++++++++++++- src/avx/avx_mixed_radix.rs | 13 +++------- 3 files changed, 86 insertions(+), 15 deletions(-) diff --git a/src/avx/avx32_butterflies.rs b/src/avx/avx32_butterflies.rs index e9dadf78..80869600 100644 --- a/src/avx/avx32_butterflies.rs +++ b/src/avx/avx32_butterflies.rs @@ -193,11 +193,17 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { fn perform_fft_out_of_place_immut( &self, - input: &[Complex], - output: &mut [Complex], - scratch: &mut [Complex], + input: &[Complex], + output: &mut [Complex], + _scratch: &mut [Complex], ) { - todo!() + // Perform the column FFTs + // Safety: self.perform_column_butterflies() requres the "avx" and "fma" instruction sets, and we return Err() in our constructor if the instructions aren't available + unsafe { self.column_butterflies_and_transpose(input, output) }; + + // process the row FFTs in-place in the output buffer + // Safety: self.transpose() requres the "avx" instruction set, and we return Err() in our constructor if the instructions aren't available + unsafe { self.row_butterflies(output) }; } #[inline] @@ -222,7 +228,31 @@ macro_rules! boilerplate_fft_simd_butterfly_with_scratch { output: &mut [Complex], scratch: &mut [Complex], ) { - todo!() + if input.len() < self.len() || output.len() != input.len() { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_input: &[Complex] = + unsafe { array_utils::workaround_transmute(input) }; + let transmuted_output: &mut [Complex] = + unsafe { array_utils::workaround_transmute_mut(output) }; + let transmuted_scratch: &mut [Complex] = + unsafe { array_utils::workaround_transmute_mut(scratch) }; + let result = array_utils::iter_chunks_zipped( + transmuted_input, + transmuted_output, + self.len(), + |in_chunk, out_chunk| self.perform_fft_out_of_place_immut(in_chunk, out_chunk, transmuted_scratch), + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } } fn process_outofplace_with_scratch( diff --git a/src/avx/avx_bluesteins.rs b/src/avx/avx_bluesteins.rs index e9757e01..63db326c 100644 --- a/src/avx/avx_bluesteins.rs +++ b/src/avx/avx_bluesteins.rs @@ -317,7 +317,53 @@ impl BluesteinsAvx { output: &mut [Complex], scratch: &mut [Complex], ) { - todo!() + let (inner_input, inner_scratch) = scratch + .split_at_mut(self.inner_fft_multiplier.len() * A::VectorType::COMPLEX_PER_VECTOR); + + // do the necessary setup for bluestein's algorithm: copy the data to the inner buffers, apply some twiddle factors, zero out the rest of the inner buffer + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_input: &[Complex] = array_utils::workaround_transmute(input); + let transmuted_inner_input: &mut [Complex] = + array_utils::workaround_transmute_mut(inner_input); + + self.prepare_bluesteins(transmuted_input, transmuted_inner_input) + } + + // run our inner forward FFT + self.common_data + .inner_fft + .process_with_scratch(inner_input, inner_scratch); + + // Multiply our inner FFT output by our precomputed data. Then, conjugate the result to set up for an inverse FFT. + // We can conjugate the result of multiplication by conjugating both inputs. We pre-conjugated the multiplier array, + // so we just need to conjugate inner_input, which the pairwise_complex_multiply_conjugated function will handle + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_inner_input: &mut [Complex] = + array_utils::workaround_transmute_mut(inner_input); + + Self::pairwise_complex_multiply_conjugated( + transmuted_inner_input, + &self.inner_fft_multiplier, + ) + }; + + // inverse FFT. we're computing a forward but we're massaging it into an inverse by conjugating the inputs and outputs + self.common_data + .inner_fft + .process_with_scratch(inner_input, inner_scratch); + + // finalize the result + unsafe { + // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary + let transmuted_output: &mut [Complex] = + array_utils::workaround_transmute_mut(output); + let transmuted_inner_input: &mut [Complex] = + array_utils::workaround_transmute_mut(inner_input); + + self.finalize_bluesteins(transmuted_inner_input, transmuted_output) + } } fn perform_fft_out_of_place( diff --git a/src/avx/avx_mixed_radix.rs b/src/avx/avx_mixed_radix.rs index 982faa40..38ab6ae4 100644 --- a/src/avx/avx_mixed_radix.rs +++ b/src/avx/avx_mixed_radix.rs @@ -77,24 +77,19 @@ macro_rules! boilerplate_mixedradix { ) { // Perform the column FFTs // Safety: self.perform_column_butterflies() requires the "avx" and "fma" instruction sets, and we return Err() in our constructor if the instructions aren't available - scratch.copy_from_slice(input); + output.copy_from_slice(input); unsafe { // Specialization workaround: See the comments in FftPlannerAvx::new() for why these calls to array_utils::workaround_transmute are necessary let transmuted_output: &mut [Complex] = - array_utils::workaround_transmute_mut(scratch); + array_utils::workaround_transmute_mut(output); self.perform_column_butterflies(transmuted_output); } - // process the row FFTs. If extra scratch was provided, pass it in. Otherwise, use the output. - // let inner_scratch = if scratch.len() > 0 { - // scratch - // } else { - // &mut output[..] - // }; self.common_data .inner_fft - .process_with_scratch(scratch, output); + .process_with_scratch(output, scratch); + scratch[..input.len()].copy_from_slice(output); // Transpose // Safety: self.transpose() requires the "avx" instruction set, and we return Err() in our constructor if the instructions aren't available From 0b0f66ffa19a74ce43c4c6a14d0e13a327cd79f5 Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 18:06:35 -0500 Subject: [PATCH 09/10] Removing blanket impl --- src/common.rs | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 6 +--- 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/src/common.rs b/src/common.rs index cc228acf..0702bc4a 100644 --- a/src/common.rs +++ b/src/common.rs @@ -73,6 +73,47 @@ pub fn fft_error_outofplace( macro_rules! boilerplate_fft_oop { ($struct_name:ident, $len_fn:expr) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_outofplace_scratch_len(); + if input.len() < self.len() + || output.len() != input.len() + || scratch.len() < required_scratch + { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + required_scratch, + scratch.len(), + ); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.perform_fft_out_of_place_immut(in_chunk, out_chunk, scratch) + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0); + } + } fn process_outofplace_with_scratch( &self, input: &mut [Complex], @@ -175,6 +216,54 @@ macro_rules! boilerplate_fft_oop { macro_rules! boilerplate_fft { ($struct_name:ident, $len_fn:expr, $inplace_scratch_len_fn:expr, $out_of_place_scratch_len_fn:expr) => { impl Fft for $struct_name { + fn process_outofplace_with_scratch_immut( + &self, + input: &[Complex], + output: &mut [Complex], + scratch: &mut [Complex], + ) { + if self.len() == 0 { + return; + } + + let required_scratch = self.get_outofplace_scratch_len(); + if scratch.len() < required_scratch + || input.len() < self.len() + || output.len() != input.len() + { + // We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here + } + + let scratch = &mut scratch[..required_scratch]; + let result = array_utils::iter_chunks_zipped( + input, + output, + self.len(), + |in_chunk, out_chunk| { + self.perform_fft_out_of_place_immut(in_chunk, out_chunk, scratch) + }, + ); + + if result.is_err() { + // We want to trigger a panic, because the buffer sizes weren't cleanly divisible by the FFT size, + // but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us + fft_error_outofplace( + self.len(), + input.len(), + output.len(), + self.get_outofplace_scratch_len(), + scratch.len(), + ); + } + } fn process_outofplace_with_scratch( &self, diff --git a/src/lib.rs b/src/lib.rs index c2615b20..f4d947eb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -241,11 +241,7 @@ pub trait Fft: Length + Direction + Sync + Send { input: &[Complex], output: &mut [Complex], scratch: &mut [Complex], - ) { - let (mut input_scratch, scratch) = scratch.split_at_mut(input.len()); - input_scratch.copy_from_slice(input); - self.process_outofplace_with_scratch(&mut input_scratch, output, scratch); - } + ); /// Returns the size of the scratch buffer required by `process_with_scratch` /// From 7a3b6c997ae70164b1e525550d184e327549de6e Mon Sep 17 00:00:00 2001 From: Michael Ciraci Date: Sun, 9 Mar 2025 18:09:31 -0500 Subject: [PATCH 10/10] Correcting 100 test len --- tests/test_basic.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_basic.rs b/tests/test_basic.rs index abf76772..18f3aad0 100644 --- a/tests/test_basic.rs +++ b/tests/test_basic.rs @@ -6,7 +6,7 @@ use std::sync::Arc; #[test] fn test_100() { // let len = 102; - for len in 37..250 { + for len in 37..5000 { dbg!(len); let mut p = FftPlanner::::new(); let planner: Arc> = p.plan_fft_forward(len); @@ -15,7 +15,7 @@ fn test_100() { // dbg!(planner.get_inplace_scratch_len(), planner.get_outofplace_scratch_len()); let mut output = input.clone(); let mut output2 = output.clone(); - let mut scratch = vec![Complex::::ZERO; planner.get_outofplace_scratch_len() + len]; + let mut scratch = vec![Complex::::ZERO; planner.get_inplace_scratch_len() + len]; // planner.process_outofplace_with_scratch(&mut input, &mut output, &mut scratch); planner.process_outofplace_with_scratch_immut(&input, &mut output, &mut scratch);