From 3de18cdb8324f7cb7f016140ec8449b433a5bdd6 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 11 Jun 2025 12:18:08 +0800
Subject: [PATCH 1/8] noir: row-level parallelism in left and right
 multiplication

---
 noir-r1cs/src/sparse_matrix.rs | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index 1a5f132a..276c46f9 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -1,6 +1,7 @@
 use {
     crate::{FieldElement, InternedFieldElement, Interner},
     ark_std::Zero,
+    rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator},
     serde::{Deserialize, Serialize},
     std::{
         fmt::Debug,
@@ -156,11 +157,14 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
             rhs.len(),
             "Vector length does not match number of columns."
         );
-        let mut result = vec![FieldElement::zero(); self.matrix.num_rows];
-        for ((i, j), value) in self.iter() {
-            result[i] += value * rhs[j];
-        }
-        result
+
+        (0..self.matrix.num_rows)
+            .into_par_iter()
+            .map(|i| {
+                self.iter_row(i)
+                    .fold(FieldElement::zero(), |sum, (j, value)| sum + value * rhs[j])
+            })
+            .collect()
     }
 }
 
@@ -176,9 +180,16 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
             "Vector length does not match number of rows."
         );
         let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols];
-        for ((i, j), value) in rhs.iter() {
-            result[j] += value * self[i];
+
+        let mult: Vec<_> = (0..rhs.matrix.num_rows)
+            .into_par_iter()
+            .flat_map_iter(|i| rhs.iter_row(i).map(move |(j, value)| (j, value * self[i])))
+            .collect();
+
+        for (j, value) in mult {
+            result[j] += value;
         }
+
         result
     }
 }

From d333f8dcc04816307be8d774431b0facfbeba89b Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Thu, 12 Jun 2025 17:51:46 +0800
Subject: [PATCH 2/8] sparse matrix: optimise storage right multiplication

---
 noir-r1cs/src/sparse_matrix.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index 276c46f9..b1ce43db 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -1,7 +1,9 @@
 use {
     crate::{FieldElement, InternedFieldElement, Interner},
     ark_std::Zero,
-    rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator},
+    rayon::iter::{
+        IndexedParallelIterator, IntoParallelIterator, ParallelBridge, ParallelIterator,
+    },
     serde::{Deserialize, Serialize},
     std::{
         fmt::Debug,
@@ -158,13 +160,16 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
             "Vector length does not match number of columns."
         );
 
+        let mut result = vec![FieldElement::zero(); self.matrix.num_rows];
+
         (0..self.matrix.num_rows)
             .into_par_iter()
             .map(|i| {
                 self.iter_row(i)
                     .fold(FieldElement::zero(), |sum, (j, value)| sum + value * rhs[j])
             })
-            .collect()
+            .collect_into_vec(&mut result);
+        result
     }
 }
 

From b4656d1a05d90e36333ceb9e9b2314202656f37f Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Thu, 12 Jun 2025 19:39:19 +0800
Subject: [PATCH 3/8] first attempt parallel

---
 noir-r1cs/src/sparse_matrix.rs | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index b1ce43db..97091553 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -6,6 +6,7 @@ use {
     },
     serde::{Deserialize, Serialize},
     std::{
+        cell::UnsafeCell,
         fmt::Debug,
         ops::{Mul, Range},
     },
@@ -173,6 +174,17 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
     }
 }
 
+struct LeftVec<T>(UnsafeCell<Vec<T>>);
+unsafe impl<T: Sync + Send> Send for LeftVec<T> {}
+unsafe impl<T: Sync + Send> Sync for LeftVec<T> {}
+
+impl<T> LeftVec<T> {
+    fn insert(&self, index: usize, value: T) {
+        let vec = unsafe { &mut *self.0.get() };
+        vec[index] = value;
+    }
+}
+
 /// Left multiplication by vector
 // OPT: Paralelize
 impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
@@ -186,12 +198,21 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
         );
         let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols];
 
-        let mult: Vec<_> = (0..rhs.matrix.num_rows)
-            .into_par_iter()
-            .flat_map_iter(|i| rhs.iter_row(i).map(move |(j, value)| (j, value * self[i])))
-            .collect();
+        let mut intermediate = LeftVec(UnsafeCell::new(vec![
+            (0, FieldElement::zero());
+            rhs.matrix.values.len()
+        ]));
+
+        let int_ref = &intermediate;
+
+        (0..rhs.matrix.num_rows).into_par_iter().for_each(|row| {
+            let range = rhs.matrix.row_range(row);
+            rhs.iter_row(row)
+                .zip(range)
+                .for_each(move |((col, value), ind)| int_ref.insert(ind, (col, value * self[row])))
+        });
 
-        for (j, value) in mult {
+        for (j, value) in intermediate.0.into_inner() {
             result[j] += value;
         }
 

From c18a743272cbb456b389dae4506ca0ce0cba9775 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Fri, 13 Jun 2025 14:13:07 +0800
Subject: [PATCH 4/8] sparse matrix: lock free array

---
 noir-r1cs/src/sparse_matrix.rs | 59 ++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index 97091553..3ded7584 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -150,7 +150,6 @@ impl HydratedSparseMatrix<'_> {
 }
 
 /// Right multiplication by vector
-// OPT: Paralelize
 impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
     type Output = Vec<FieldElement>;
 
@@ -174,19 +173,26 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
     }
 }
 
-struct LeftVec<T>(UnsafeCell<Vec<T>>);
-unsafe impl<T: Sync + Send> Send for LeftVec<T> {}
-unsafe impl<T: Sync + Send> Sync for LeftVec<T> {}
+// Provide interior mutability where
+struct LockFreeArray<T>(UnsafeCell<Box<[T]>>);
+unsafe impl<T: Sync + Send> Send for LockFreeArray<T> {}
+unsafe impl<T: Sync + Send> Sync for LockFreeArray<T> {}
 
-impl<T> LeftVec<T> {
-    fn insert(&self, index: usize, value: T) {
-        let vec = unsafe { &mut *self.0.get() };
+impl<T> LockFreeArray<T> {
+    fn new(vec: Vec<T>) -> Self {
+        let arr = vec.into_boxed_slice();
+        LockFreeArray(UnsafeCell::new(arr))
+    }
+
+    // Requires that only one thread has access to index and that the index is
+    // within bounds.
+    unsafe fn insert(&self, index: usize, value: T) {
+        let vec = { &mut **self.0.get() };
         vec[index] = value;
     }
 }
 
 /// Left multiplication by vector
-// OPT: Paralelize
 impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
     type Output = Vec<FieldElement>;
 
@@ -196,23 +202,42 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
             rhs.matrix.num_rows,
             "Vector length does not match number of rows."
         );
-        let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols];
-
-        let mut intermediate = LeftVec(UnsafeCell::new(vec![
-            (0, FieldElement::zero());
-            rhs.matrix.values.len()
-        ]));
 
-        let int_ref = &intermediate;
+        // E
+        let intermediate_multiplication =
+            LockFreeArray::new(vec![(0, FieldElement::zero()); rhs.matrix.num_entries()]);
+
+        let intermediate_reference = &intermediate_multiplication;
+
+        // Mapping phase
+        //
+        // Parallelize the multiplication
+        // Use a lock-free array to prevent constant resizing when collecting the
+        // iterator as the size is not known to Rayon. Collecting without a
+        // preallocating the intermediate vector is >15% slower
+        // Other options that have been explored
+        // - An IndexedParallelIterator on the values of the sparse matrix also wasn't
+        //   an option as it requires random access which we can't provide as we
+        //   wouldn't know the row a value belongs to. That's why the rows drive the
+        //   iterator below.
+        // - Acquiring a mutex per column in the result was too expensive (even with
+        //   parking_lot)
 
         (0..rhs.matrix.num_rows).into_par_iter().for_each(|row| {
             let range = rhs.matrix.row_range(row);
             rhs.iter_row(row)
                 .zip(range)
-                .for_each(move |((col, value), ind)| int_ref.insert(ind, (col, value * self[row])))
+                .for_each(move |((col, value), ind)| unsafe {
+                    intermediate_reference.insert(ind, (col, value * self[row]))
+                })
         });
 
-        for (j, value) in intermediate.0.into_inner() {
+        let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols];
+
+        // Reduce phase
+        // Single thread for folding to not have a mutex per column in the result.
+
+        for (j, value) in intermediate_multiplication.0.into_inner() {
             result[j] += value;
         }
 

From 17aee00ba04aaa442400035201e29055826913d9 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 16 Jun 2025 23:33:08 +0800
Subject: [PATCH 5/8] stash

---
 Cargo.toml                         |  4 +---
 noir-r1cs/Cargo.toml               |  2 ++
 noir-r1cs/benches/bench.rs         | 36 +++++++++++++++++++++++++++++-
 noir-r1cs/src/bin/profile_prove.rs | 35 +++++++++++++++++++++++++++++
 noir-r1cs/src/sparse_matrix.rs     |  1 -
 5 files changed, 73 insertions(+), 5 deletions(-)
 create mode 100644 noir-r1cs/src/bin/profile_prove.rs

diff --git a/Cargo.toml b/Cargo.toml
index 329cc316..bfaa488e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,10 +41,8 @@ missing_docs_in_private_items = { level = "allow", priority = 1 }
 missing_safety_doc = { level = "deny", priority = 1 }
 
 [profile.release]
-debug = true      # Generate symbol info for profiling
+debug = true  # Generate symbol info for profiling
 opt-level = 3
-codegen-units = 1
-lto = "fat"
 
 # Doing light optimizations helps test performance more than it hurts build time.
 [profile.test]
diff --git a/noir-r1cs/Cargo.toml b/noir-r1cs/Cargo.toml
index 21c1f316..450498de 100644
--- a/noir-r1cs/Cargo.toml
+++ b/noir-r1cs/Cargo.toml
@@ -53,6 +53,8 @@ ark-poly.workspace = true
 ark-std.workspace = true
 ark-serialize.workspace = true
 
+noir-tools.workspace = true
+
 # Binary
 # See <https://github.com/rust-lang/cargo/issues/1982>
 argh.workspace = true
diff --git a/noir-r1cs/benches/bench.rs b/noir-r1cs/benches/bench.rs
index 59aa1583..75acad82 100644
--- a/noir-r1cs/benches/bench.rs
+++ b/noir-r1cs/benches/bench.rs
@@ -1,8 +1,12 @@
 //! Divan benchmarks for noir-r1cs
 use {
+    bincode,
     core::hint::black_box,
     divan::Bencher,
-    noir_r1cs::{read, NoirProof, NoirProofScheme},
+    noir_r1cs::{
+        read, utils::sumcheck::calculate_external_row_of_r1cs_matrices, FieldElement, NoirProof,
+        NoirProofScheme, R1CS,
+    },
     noir_tools::compile_workspace,
     std::path::Path,
 };
@@ -67,6 +71,36 @@ fn verify_poseidon_1000(bencher: Bencher) {
     bencher.bench(|| black_box(&scheme).verify(black_box(&proof)));
 }
 
+/// Benchmarks `calculate_external_row_of_r1cs_matrices` on an `alpha` vector
+/// and an `R1CS` instance decoded from the bincode fixtures `benches/alpha.bin`
+/// and `benches/r1cs.bin`; panics if either fixture cannot be read or decoded.
+#[divan::bench]
+fn calculate_external_row_from_serialized_data(bencher: Bencher) {
+    // Both fixtures live in the same directory; resolve it once.
+    let bench_dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("benches");
+    let alpha_path = bench_dir.join("alpha.bin");
+    let r1cs_path = bench_dir.join("r1cs.bin");
+
+    // `alpha.bin` stores `u64`s; they are lifted into field elements below.
+    let alpha_raw: Vec<u64> =
+        bincode::deserialize(&std::fs::read(&alpha_path).expect("Failed to read alpha.bin"))
+            .expect("Failed to deserialize alpha");
+    let alpha: Vec<FieldElement> = alpha_raw.into_iter().map(FieldElement::from).collect();
+
+    let r1cs: R1CS =
+        bincode::deserialize(&std::fs::read(&r1cs_path).expect("Failed to read r1cs.bin"))
+            .expect("Failed to deserialize r1cs");
+
+    // Setup above runs outside the closure handed to `bench`; `black_box` hides
+    // the inputs and the result from the optimizer.
+    bencher.bench(|| {
+        black_box(calculate_external_row_of_r1cs_matrices(
+            black_box(&alpha),
+            black_box(&r1cs),
+        ))
+    });
+}
+
 fn main() {
     divan::main();
 }
diff --git a/noir-r1cs/src/bin/profile_prove.rs b/noir-r1cs/src/bin/profile_prove.rs
new file mode 100644
index 00000000..23325e77
--- /dev/null
+++ b/noir-r1cs/src/bin/profile_prove.rs
@@ -0,0 +1,35 @@
+//! Standalone executable for profiling noir-r1cs prove operations
+use {
+    noir_r1cs::{read, NoirProofScheme},
+    noir_tools::compile_workspace,
+    std::path::Path,
+};
+
+/// Profiling harness: sets up the Poseidon proof scheme and witness, then runs
+/// `prove` so that a profiler attached to this binary samples the prover.
+fn main() {
+    // Number of `prove` runs; raise it to gather more profiling samples.
+    const ITERATIONS: usize = 1;
+
+    println!("Starting prove profiling...");
+
+    let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let poseidon_path = manifest_path.join("benches").join("poseidon-1000.nps");
+    let scheme: NoirProofScheme = read(&poseidon_path).expect("Failed reading proof scheme");
+
+    let crate_dir = manifest_path.join("../noir-examples/poseidon-rounds");
+    compile_workspace(&crate_dir).expect("Compiling workspace");
+
+    let input_map = scheme
+        .read_witness(&crate_dir.join("Prover.toml"))
+        .expect("Failed reading witness");
+
+    println!("Setup complete, starting prove operations...");
+    for i in 0..ITERATIONS {
+        println!("Prove iteration {}", i + 1);
+        // Result is dropped unchecked: only the cost of proving matters here.
+        let _proof = scheme.prove(&input_map);
+    }
+
+    println!("Profiling complete!");
+}
diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index 3ded7584..6a1aca12 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -203,7 +203,6 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
             "Vector length does not match number of rows."
         );
 
-        // E
         let intermediate_multiplication =
             LockFreeArray::new(vec![(0, FieldElement::zero()); rhs.matrix.num_entries()]);
 

From 66f8dc7e8bb1d746ef2f6c40c7db643000165128 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 25 Jun 2025 15:33:58 +0800
Subject: [PATCH 6/8] Do not initialize vector

---
 noir-r1cs/src/sparse_matrix.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index 6a1aca12..9754de95 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -160,7 +160,7 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
             "Vector length does not match number of columns."
         );
 
-        let mut result = vec![FieldElement::zero(); self.matrix.num_rows];
+        let mut result = Vec::with_capacity(self.matrix.num_rows);
 
         (0..self.matrix.num_rows)
             .into_par_iter()

From 62462ca96d964ab2498f11d134fc1f804da7cb5c Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Fri, 27 Jun 2025 12:04:15 +0800
Subject: [PATCH 7/8] first attempt at splitting result

---
 noir-r1cs/src/sparse_matrix.rs | 54 ++++++++++++----------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index 9754de95..d964f6e2 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -2,13 +2,14 @@ use {
     crate::{FieldElement, InternedFieldElement, Interner},
     ark_std::Zero,
     rayon::iter::{
-        IndexedParallelIterator, IntoParallelIterator, ParallelBridge, ParallelIterator,
+        IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelBridge,
+        ParallelIterator,
     },
     serde::{Deserialize, Serialize},
     std::{
         cell::UnsafeCell,
         fmt::Debug,
-        ops::{Mul, Range},
+        ops::{Div, Mul, Range},
     },
 };
 /// A sparse matrix with interned field elements
@@ -203,42 +204,25 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
             "Vector length does not match number of rows."
         );
 
-        let intermediate_multiplication =
-            LockFreeArray::new(vec![(0, FieldElement::zero()); rhs.matrix.num_entries()]);
-
-        let intermediate_reference = &intermediate_multiplication;
-
-        // Mapping phase
-        //
-        // Parallelize the multiplication
-        // Use a lock-free array to prevent constant resizing when collecting the
-        // iterator as the size is not known to Rayon. Collecting without a
-        // preallocating the intermediate vector is >15% slower
-        // Other options that have been explored
-        // - An IndexedParallelIterator on the values of the sparse matrix also wasn't
-        //   an option as it requires random access which we can't provide as we
-        //   wouldn't know the row a value belongs to. That's why the rows drive the
-        //   iterator below.
-        // - Acquiring a mutex per column in the result was too expensive (even with
-        //   parking_lot)
-
-        (0..rhs.matrix.num_rows).into_par_iter().for_each(|row| {
-            let range = rhs.matrix.row_range(row);
-            rhs.iter_row(row)
-                .zip(range)
-                .for_each(move |((col, value), ind)| unsafe {
-                    intermediate_reference.insert(ind, (col, value * self[row]))
-                })
-        });
-
+        let num_threads = rayon::current_num_threads();
         let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols];
 
-        // Reduce phase
-        // Single thread for folding to not have a mutex per column in the result.
+        let chunk_size = result.len().div_ceil(num_threads);
 
-        for (j, value) in intermediate_multiplication.0.into_inner() {
-            result[j] += value;
-        }
+        result
+            .par_iter_mut()
+            .chunks(chunk_size)
+            .enumerate()
+            .for_each(|(chunk_number, mut chunk)| {
+                let base = chunk_number * chunk_size;
+                let col_range = base..base + chunk_size;
+                rhs.iter()
+                    .filter(|((_row, col), _value)| col_range.contains(col))
+                    .for_each(|((row, col), value)| {
+                        let index = col - base;
+                        *(chunk[index]) += self[row] * value;
+                    });
+            });
 
         result
     }

From 72bb6ea69120b64bf5fc3cb1b67ae87fa7773b20 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Fri, 27 Jun 2025 12:22:43 +0800
Subject: [PATCH 8/8] cleanup

---
 noir-r1cs/src/sparse_matrix.rs | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs
index d964f6e2..7abc45d9 100644
--- a/noir-r1cs/src/sparse_matrix.rs
+++ b/noir-r1cs/src/sparse_matrix.rs
@@ -2,14 +2,12 @@ use {
     crate::{FieldElement, InternedFieldElement, Interner},
     ark_std::Zero,
     rayon::iter::{
-        IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelBridge,
-        ParallelIterator,
+        IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator,
     },
     serde::{Deserialize, Serialize},
     std::{
-        cell::UnsafeCell,
         fmt::Debug,
-        ops::{Div, Mul, Range},
+        ops::{Mul, Range},
     },
 };
 /// A sparse matrix with interned field elements
@@ -174,25 +172,6 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
     }
 }
 
-// Provide interior mutability where
-struct LockFreeArray<T>(UnsafeCell<Box<[T]>>);
-unsafe impl<T: Sync + Send> Send for LockFreeArray<T> {}
-unsafe impl<T: Sync + Send> Sync for LockFreeArray<T> {}
-
-impl<T> LockFreeArray<T> {
-    fn new(vec: Vec<T>) -> Self {
-        let arr = vec.into_boxed_slice();
-        LockFreeArray(UnsafeCell::new(arr))
-    }
-
-    // Requires that only one thread has access to index and that the index is
-    // within bounds.
-    unsafe fn insert(&self, index: usize, value: T) {
-        let vec = { &mut **self.0.get() };
-        vec[index] = value;
-    }
-}
-
 /// Left multiplication by vector
 impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
     type Output = Vec<FieldElement>;
@@ -209,6 +188,7 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
 
         let chunk_size = result.len().div_ceil(num_threads);
 
+        // In microbenchmarks par_iter_mut.chunks outperforms par_chunks_mut slightly.
         result
             .par_iter_mut()
             .chunks(chunk_size)