ejmahler · michaelciraci · Mar 9, 2025 · Mar 9, 2025 · Mar 9, 2025 · Mar 9, 2025
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,7 +13,7 @@ categories = ["algorithms", "compression", "multimedia::encoding", "science"]
 license = "MIT OR Apache-2.0"
 
 [features]
-default = ["avx", "sse", "neon"]
+default = ["avx","neon"]
 
 # On x86_64, the "avx" feature enables compilation of AVX-acclerated code. 
 # Similarly, the "sse" feature enables compilation of SSE-accelerated code. 

diff --git a/src/algorithm/bluesteins_algorithm.rs b/src/algorithm/bluesteins_algorithm.rs
@@ -137,6 +137,18 @@ impl<T: FftNum> BluesteinsAlgorithm<T> {
         }
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        // `input` is read-only, so copy it into the front of `scratch` (panics if `scratch` is shorter than `input`) and run the mutable out-of-place path on that copy. TODO: find a way to avoid the copy
+        let (input_scratch, scratch) = scratch.split_at_mut(input.len());
+        input_scratch.copy_from_slice(input);
+        self.process_outofplace_with_scratch(input_scratch, output, scratch);
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/algorithm/butterflies.rs b/src/algorithm/butterflies.rs
@@ -17,6 +17,39 @@ macro_rules! boilerplate_fft_butterfly {
             }
         }
         impl<T: FftNum> Fft<T> for $struct_name<T> {
+            fn process_outofplace_with_scratch_immut(
+                &self,
+                input: &[Complex<T>],
+                output: &mut [Complex<T>],
+                _scratch: &mut [Complex<T>],
+            ) {
+                if input.len() < self.len() || output.len() != input.len() {
+                    // The buffers are too short or mismatched. We want to trigger a panic, but we want to avoid doing it in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
+                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
+                    return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here
+                }
+
+                let result = array_utils::iter_chunks_zipped(
+                    input,
+                    output,
+                    self.len(),
+                    |in_chunk, out_chunk| {
+                        unsafe { // NOTE(review): every chunk handed to this closure is exactly self.len() elements long; presumably that is what perform_fft_butterfly requires -- confirm
+                            self.perform_fft_butterfly(DoubleBuf {
+                                input: in_chunk,
+                                output: out_chunk,
+                            })
+                        };
+                    },
+                );
+
+                if result.is_err() {
+                    // An Err here means the buffer sizes weren't cleanly divisible by the FFT size, which we also treat as a panic.
+                    // Again, we avoid panicking in this function to reduce code size, so call a function marked cold and inline(never) that will do it for us
+                    fft_error_outofplace(self.len(), input.len(), output.len(), 0, 0);
+                }
+            }
+
             fn process_outofplace_with_scratch(
                 &self,
                 input: &mut [Complex<T>],
@@ -29,7 +62,7 @@ macro_rules! boilerplate_fft_butterfly {
                     return; // Unreachable, because fft_error_outofplace asserts, but it helps codegen to put it here
                 }
 
-                let result = array_utils::iter_chunks_zipped(
+                let result = array_utils::iter_chunks_zipped_mut(
                     input,
                     output,
                     self.len(),
@@ -56,7 +89,7 @@ macro_rules! boilerplate_fft_butterfly {
                     return; // Unreachable, because fft_error_inplace asserts, but it helps codegen to put it here
                 }
 
-                let result = array_utils::iter_chunks(buffer, self.len(), |chunk| unsafe {
+                let result = array_utils::iter_chunks_mut(buffer, self.len(), |chunk| unsafe {
                     self.perform_fft_butterfly(chunk)
                 });
 
@@ -104,6 +137,15 @@ impl<T: FftNum> Butterfly1<T> {
     }
 }
 impl<T: FftNum> Fft<T> for Butterfly1<T> {
+    fn process_outofplace_with_scratch_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        _scratch: &mut [Complex<T>],
+    ) {
+        output.copy_from_slice(input); // nothing to compute: the output is the input, element for element (panics if the two lengths differ)
+    }
+
     fn process_outofplace_with_scratch(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/algorithm/dft.rs b/src/algorithm/dft.rs
@@ -68,6 +68,15 @@ impl<T: FftNum> Dft<T> {
             }
         }
     }
+
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        todo!() // placeholder: the immutable-input transform is not written for `Dft` yet, so any call panics here
+    }
 }
 boilerplate_fft_oop!(Dft, |this: &Dft<_>| this.twiddles.len());
 

diff --git a/src/algorithm/good_thomas_algorithm.rs b/src/algorithm/good_thomas_algorithm.rs
@@ -241,6 +241,15 @@ impl<T: FftNum> GoodThomasAlgorithm<T> {
         self.reindex_output(scratch, buffer);
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        todo!() // placeholder: the immutable-input transform is not written for `GoodThomasAlgorithm` yet, so any call panics here
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],
@@ -384,6 +393,38 @@ impl<T: FftNum> GoodThomasAlgorithmSmall<T> {
         }
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        // These checks are for the unsafe block down below; we're relying on the optimizer to get rid of them. `scratch` is passed to that unsafe call too, so it is trimmed to the same length
+        assert_eq!(self.len(), input.len());
+        assert_eq!(self.len(), output.len());
+        let scratch = &mut scratch[..self.len()]; // panics if `scratch` is too short; any excess is ignored
+        let (input_map, output_map) = self.input_output_map.split_at(self.len());
+
+        // copy the input using our reordering mapping
+        for (output_element, &input_index) in output.iter_mut().zip(input_map.iter()) {
+            *output_element = input[input_index];
+        }
+
+        // run FFTs of size `width`
+        self.width_size_fft.process_with_scratch(output, scratch);
+
+        // transpose
+        unsafe { array_utils::transpose_small(self.width, self.height, output, scratch) };
+
+        // run FFTs of size 'height'
+        self.height_size_fft.process_with_scratch(scratch, output);
+
+        // copy to the output, using our output reordering mapping
+        for (input_element, &output_index) in scratch.iter().zip(output_map.iter()) {
+            output[output_index] = *input_element;
+        }
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/algorithm/mixed_radix.rs b/src/algorithm/mixed_radix.rs
@@ -151,6 +151,45 @@ impl<T: FftNum> MixedRadix<T> {
         transpose::transpose(scratch, buffer, self.width, self.height);
     }
 
+    /// Computes the FFT of `input` into `output` without modifying `input`, using `scratch` both as
+    /// temporary storage for the inner FFTs and as the staging buffer for the intermediate transposes.
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        // STEP 1: transpose
+        transpose::transpose(input, output, self.width, self.height);
+
+        // STEP 2: perform FFTs of size `height`
+        // `input` is an immutable slice, so it cannot double as temporary
+        // storage for the inner FFT. Pass the whole caller-provided `scratch`
+        // instead; it is only trimmed further down, where an exact length
+        // is required.
+        self.height_size_fft.process_with_scratch(output, scratch);
+
+        // STEP 3: Apply twiddle factors
+        for (element, twiddle) in output.iter_mut().zip(self.twiddles.iter()) {
+            *element = *element * twiddle;
+        }
+
+        // STEP 4: transpose again
+        // Per the `transpose` crate's documented contract, both slices must hold
+        // exactly `width * height` elements or the call panics. Step 1 already
+        // enforced that for `output`, so use only that many elements of `scratch`.
+        let scratch = &mut scratch[..output.len()];
+        transpose::transpose(output, scratch, self.height, self.width);
+
+        // STEP 5: perform FFTs of size `width`
+        // The data now lives in `scratch`, so `output` is free to act as the
+        // inner FFT's temporary storage; step 6 overwrites all of it.
+        self.width_size_fft.process_with_scratch(scratch, output);
+
+        // STEP 6: transpose again
+        transpose::transpose(scratch, output, self.width, self.height);
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],
@@ -302,6 +341,34 @@ impl<T: FftNum> MixedRadixSmall<T> {
         unsafe { array_utils::transpose_small(self.width, self.height, scratch, buffer) };
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        // SIX STEP FFT -- STEP 1: transpose
+        unsafe { array_utils::transpose_small(self.width, self.height, input, output) };
+
+        // STEP 2: perform FFTs of size `height`
+        self.height_size_fft.process_with_scratch(output, scratch);
+
+        // STEP 3: Apply twiddle factors
+        for (element, twiddle) in output.iter_mut().zip(self.twiddles.iter()) {
+            *element = *element * twiddle;
+        }
+
+        // STEP 4: transpose again. Trim `scratch` to exactly `width * height` elements first (panics if it is shorter), so the unsafe transposes below never see a buffer of the wrong size
+        let scratch = &mut scratch[..self.width * self.height];
+        unsafe { array_utils::transpose_small(self.height, self.width, output, scratch) };
+
+        // STEP 5: perform FFTs of size `width`
+        self.width_size_fft.process_with_scratch(scratch, output);
+
+        // STEP 6: transpose again
+        unsafe { array_utils::transpose_small(self.width, self.height, scratch, output) };
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/algorithm/raders_algorithm.rs b/src/algorithm/raders_algorithm.rs
@@ -119,6 +119,15 @@ impl<T: FftNum> RadersAlgorithm<T> {
         }
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        todo!() // placeholder: the immutable-input transform is not written for `RadersAlgorithm` yet, so any call panics here
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/algorithm/radix3.rs b/src/algorithm/radix3.rs
@@ -118,6 +118,15 @@ impl<T: FftNum> Radix3<T> {
         self.outofplace_scratch_len
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        todo!() // placeholder: the immutable-input transform is not written for `Radix3` yet, so any call panics here
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/algorithm/radix4.rs b/src/algorithm/radix4.rs
@@ -124,6 +124,15 @@ impl<T: FftNum> Radix4<T> {
         self.outofplace_scratch_len
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        todo!() // placeholder: the immutable-input transform is not written for `Radix4` yet, so any call panics here
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/algorithm/radixn.rs b/src/algorithm/radixn.rs
@@ -174,6 +174,15 @@ impl<T: FftNum> RadixN<T> {
         self.outofplace_scratch_len
     }
 
+    fn perform_fft_out_of_place_immut(
+        &self,
+        input: &[Complex<T>],
+        output: &mut [Complex<T>],
+        scratch: &mut [Complex<T>],
+    ) {
+        todo!() // placeholder: the immutable-input transform is not written for `RadixN` yet, so any call panics here
+    }
+
     fn perform_fft_out_of_place(
         &self,
         input: &mut [Complex<T>],

diff --git a/src/array_utils.rs b/src/array_utils.rs
@@ -146,6 +146,29 @@ mod unit_tests {
 // Loop over exact chunks of the provided buffer. Very similar in semantics to ChunksExactMut, but generates smaller code and requires no modulo operations
 // Returns Ok() if every element ended up in a chunk, Err() if there was a remainder
 pub fn iter_chunks<T>(
+    mut buffer: &[T],
+    chunk_size: usize,
+    mut chunk_fn: impl FnMut(&[T]),
+) -> Result<(), ()> {
+    // NOTE: `chunk_size` is expected to be non-zero. With a chunk size of zero the loop below
+    // peels off empty chunks forever, because the remaining buffer never gets any shorter.
+    // Peel `chunk_size` elements off the front for as long as a full chunk is left, handing each one to `chunk_fn`
+    while buffer.len() >= chunk_size {
+        let (chunk, rest) = buffer.split_at(chunk_size);
+        chunk_fn(chunk);
+        buffer = rest;
+    }
+
+    // Anything still in the buffer is a partial chunk nobody asked for -- report it to the caller as an error
+    if !buffer.is_empty() {
+        return Err(());
+    }
+    Ok(())
+}
+
+// Loop over exact chunks of the provided buffer. Very similar in semantics to ChunksExactMut, but generates smaller code and requires no modulo operations
+// Returns Ok() if every element ended up in a chunk, Err() if there was a remainder
+pub fn iter_chunks_mut<T>(
     mut buffer: &mut [T],
     chunk_size: usize,
     mut chunk_fn: impl FnMut(&mut [T]),
@@ -169,6 +192,44 @@ pub fn iter_chunks<T>(
 // Loop over exact zipped chunks of the 2 provided buffers. Very similar in semantics to ChunksExactMut.zip(ChunksExactMut), but generates smaller code and requires no modulo operations
 // Returns Ok() if every element of both buffers ended up in a chunk, Err() if there was a remainder
 pub fn iter_chunks_zipped<T>(
+    mut buffer1: &[T],
+    mut buffer2: &mut [T],
+    chunk_size: usize,
+    mut chunk_fn: impl FnMut(&[T], &mut [T]),
+) -> Result<(), ()> {
+    // If the two buffers aren't the same size, record the fact that they're different, then snip the longer one down to the length of the shorter one
+    let uneven = if buffer1.len() > buffer2.len() {
+        buffer1 = &buffer1[..buffer2.len()];
+        true
+    } else if buffer2.len() > buffer1.len() {
+        buffer2 = &mut buffer2[..buffer1.len()];
+        true
+    } else {
+        false
+    };
+
+    // Now that we know the two slices are the same length, loop over each one, splicing off chunk_size at a time, and calling chunk_fn on each pair of chunks
+    while buffer1.len() >= chunk_size && buffer2.len() >= chunk_size {
+        let (head1, tail1) = buffer1.split_at(chunk_size);
+        buffer1 = tail1;
+
+        let (head2, tail2) = buffer2.split_at_mut(chunk_size);
+        buffer2 = tail2;
+
+        chunk_fn(head1, head2);
+    }
+
+    // We have a remainder if the 2 buffers were uneven to start with (in either direction), or if there's still data in the buffers -- in which case we want to indicate to the caller that there was an unwanted remainder
+    if !uneven && buffer1.is_empty() {
+        Ok(())
+    } else {
+        Err(())
+    }
+}
+
+// Loop over exact zipped chunks of the 2 provided buffers. Very similar in semantics to ChunksExactMut.zip(ChunksExactMut), but generates smaller code and requires no modulo operations
+// Returns Ok() if every element of both buffers ended up in a chunk, Err() if there was a remainder
+pub fn iter_chunks_zipped_mut<T>(
     mut buffer1: &mut [T],
     mut buffer2: &mut [T],
     chunk_size: usize,