From 3de18cdb8324f7cb7f016140ec8449b433a5bdd6 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 11 Jun 2025 12:18:08 +0800 Subject: [PATCH 1/8] noir: row-level parallelism in left and right multiplication --- noir-r1cs/src/sparse_matrix.rs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index 1a5f132a..276c46f9 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -1,6 +1,7 @@ use { crate::{FieldElement, InternedFieldElement, Interner}, ark_std::Zero, + rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}, serde::{Deserialize, Serialize}, std::{ fmt::Debug, @@ -156,11 +157,14 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { rhs.len(), "Vector length does not match number of columns." ); - let mut result = vec![FieldElement::zero(); self.matrix.num_rows]; - for ((i, j), value) in self.iter() { - result[i] += value * rhs[j]; - } - result + + (0..self.matrix.num_rows) + .into_par_iter() + .map(|i| { + self.iter_row(i) + .fold(FieldElement::zero(), |sum, (j, value)| sum + value * rhs[j]) + }) + .collect() } } @@ -176,9 +180,16 @@ impl Mul> for &[FieldElement] { "Vector length does not match number of rows." ); let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols]; - for ((i, j), value) in rhs.iter() { - result[j] += value * self[i]; + + let mult: Vec<_> = (0..rhs.matrix.num_rows) + .into_par_iter() + .flat_map_iter(|i| rhs.iter_row(i).map(move |(j, value)| (j, value * self[i]))) + .collect(); + + for (j, value) in mult { + result[j] += value; } + result } } From d333f8dcc04816307be8d774431b0facfbeba89b Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Thu, 12 Jun 2025 17:51:46 +0800 Subject: [PATCH 2/8] sparse matrix: optimise storage right multiplication --- noir-r1cs/src/sparse_matrix.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index 276c46f9..b1ce43db 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -1,7 +1,9 @@ use { crate::{FieldElement, InternedFieldElement, Interner}, ark_std::Zero, - rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator}, + rayon::iter::{ + IndexedParallelIterator, IntoParallelIterator, ParallelBridge, ParallelIterator, + }, serde::{Deserialize, Serialize}, std::{ fmt::Debug, @@ -158,13 +160,16 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { "Vector length does not match number of columns." ); + let mut result = vec![FieldElement::zero(); self.matrix.num_rows]; + (0..self.matrix.num_rows) .into_par_iter() .map(|i| { self.iter_row(i) .fold(FieldElement::zero(), |sum, (j, value)| sum + value * rhs[j]) }) - .collect() + .collect_into_vec(&mut result); + result } } From b4656d1a05d90e36333ceb9e9b2314202656f37f Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Thu, 12 Jun 2025 19:39:19 +0800 Subject: [PATCH 3/8] first attempt parallel --- noir-r1cs/src/sparse_matrix.rs | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index b1ce43db..97091553 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -6,6 +6,7 @@ use { }, serde::{Deserialize, Serialize}, std::{ + cell::UnsafeCell, fmt::Debug, ops::{Mul, Range}, }, @@ -173,6 +174,17 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { } } +struct LeftVec(UnsafeCell>); +unsafe impl Send for LeftVec {} +unsafe impl Sync for LeftVec {} + +impl LeftVec { + fn insert(&self, index: usize, value: T) { + let vec = unsafe { &mut *self.0.get() }; + vec[index] = value; + } +} + /// Left multiplication by vector // OPT: Paralelize impl Mul> for &[FieldElement] { @@ -186,12 +198,21 @@ impl Mul> for &[FieldElement] { ); let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols]; - let mult: Vec<_> = (0..rhs.matrix.num_rows) - .into_par_iter() - .flat_map_iter(|i| rhs.iter_row(i).map(move |(j, value)| (j, value * self[i]))) - .collect(); + let mut intermediate = LeftVec(UnsafeCell::new(vec![ + (0, FieldElement::zero()); + rhs.matrix.values.len() + ])); + + let int_ref = &intermediate; + + (0..rhs.matrix.num_rows).into_par_iter().for_each(|row| { + let range = rhs.matrix.row_range(row); + rhs.iter_row(row) + .zip(range) + .for_each(move |((col, value), ind)| int_ref.insert(ind, (col, value * self[row]))) + }); - for (j, value) in mult { + for (j, value) in intermediate.0.into_inner() { result[j] += value; } From c18a743272cbb456b389dae4506ca0ce0cba9775 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Fri, 13 Jun 2025 14:13:07 +0800 Subject: [PATCH 4/8] sparse matrix: lock free array --- noir-r1cs/src/sparse_matrix.rs | 59 ++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index 97091553..3ded7584 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -150,7 +150,6 @@ impl HydratedSparseMatrix<'_> { } /// Right multiplication by vector -// OPT: Paralelize impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { type Output = Vec; @@ -174,19 +173,26 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { } } -struct LeftVec(UnsafeCell>); -unsafe impl Send for LeftVec {} -unsafe impl Sync for LeftVec {} +// Provide interior mutability where +struct LockFreeArray(UnsafeCell>); +unsafe impl Send for LockFreeArray {} +unsafe impl Sync for LockFreeArray {} -impl LeftVec { - fn insert(&self, index: usize, value: T) { - let vec = unsafe { &mut *self.0.get() }; +impl LockFreeArray { + fn new(vec: Vec) -> Self { + let arr = vec.into_boxed_slice(); + LockFreeArray(UnsafeCell::new(arr)) + } + + // Requires that only one thread has access to index and that the index is + // within bounds. + unsafe fn insert(&self, index: usize, value: T) { + let vec = { &mut **self.0.get() }; vec[index] = value; } } /// Left multiplication by vector -// OPT: Paralelize impl Mul> for &[FieldElement] { type Output = Vec; @@ -196,23 +202,42 @@ impl Mul> for &[FieldElement] { rhs.matrix.num_rows, "Vector length does not match number of rows." ); - let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols]; - - let mut intermediate = LeftVec(UnsafeCell::new(vec![ - (0, FieldElement::zero()); - rhs.matrix.values.len() - ])); - let int_ref = &intermediate; + // E + let intermediate_multiplication = + LockFreeArray::new(vec![(0, FieldElement::zero()); rhs.matrix.num_entries()]); + + let intermediate_reference = &intermediate_multiplication; + + // Mapping phase + // + // Parallelize the multiplication + // Use a lock-free array to prevent constant resizing when collecting the + // iterator as the size is not known to Rayon. Collecting without a + // preallocating the intermediate vector is >15% slower + // Other options that have been explored + // - An IndexedParallelIterator on the values of the sparse matrix also wasn't + // an option as it requires random access which we can't provide as we + // wouldn't know the row a value belongs to. That's why the rows drive the + // iterator below. + // - Acquiring a mutex per column in the result was too expensive (even with + // parking_lot) (0..rhs.matrix.num_rows).into_par_iter().for_each(|row| { let range = rhs.matrix.row_range(row); rhs.iter_row(row) .zip(range) - .for_each(move |((col, value), ind)| int_ref.insert(ind, (col, value * self[row]))) + .for_each(move |((col, value), ind)| unsafe { + intermediate_reference.insert(ind, (col, value * self[row])) + }) }); - for (j, value) in intermediate.0.into_inner() { + let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols]; + + // Reduce phase + // Single thread for folding to not have a mutex per column in the result. + + for (j, value) in intermediate_multiplication.0.into_inner() { result[j] += value; } From 17aee00ba04aaa442400035201e29055826913d9 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Mon, 16 Jun 2025 23:33:08 +0800 Subject: [PATCH 5/8] stash --- Cargo.toml | 4 +--- noir-r1cs/Cargo.toml | 2 ++ noir-r1cs/benches/bench.rs | 36 +++++++++++++++++++++++++++++- noir-r1cs/src/bin/profile_prove.rs | 35 +++++++++++++++++++++++++++++ noir-r1cs/src/sparse_matrix.rs | 1 - 5 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 noir-r1cs/src/bin/profile_prove.rs diff --git a/Cargo.toml b/Cargo.toml index 329cc316..bfaa488e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,10 +41,8 @@ missing_docs_in_private_items = { level = "allow", priority = 1 } missing_safety_doc = { level = "deny", priority = 1 } [profile.release] -debug = true # Generate symbol info for profiling +debug = true # Generate symbol info for profiling opt-level = 3 -codegen-units = 1 -lto = "fat" # Doing light optimizations helps test performance more than it hurts build time. [profile.test] diff --git a/noir-r1cs/Cargo.toml b/noir-r1cs/Cargo.toml index 21c1f316..450498de 100644 --- a/noir-r1cs/Cargo.toml +++ b/noir-r1cs/Cargo.toml @@ -53,6 +53,8 @@ ark-poly.workspace = true ark-std.workspace = true ark-serialize.workspace = true +noir-tools.workspace = true + # Binary # See argh.workspace = true diff --git a/noir-r1cs/benches/bench.rs b/noir-r1cs/benches/bench.rs index 59aa1583..75acad82 100644 --- a/noir-r1cs/benches/bench.rs +++ b/noir-r1cs/benches/bench.rs @@ -1,8 +1,12 @@ //! Divan benchmarks for noir-r1cs use { + bincode, core::hint::black_box, divan::Bencher, - noir_r1cs::{read, NoirProof, NoirProofScheme}, + noir_r1cs::{ + read, utils::sumcheck::calculate_external_row_of_r1cs_matrices, FieldElement, NoirProof, + NoirProofScheme, R1CS, + }, noir_tools::compile_workspace, std::path::Path, }; @@ -67,6 +71,36 @@ fn verify_poseidon_1000(bencher: Bencher) { bencher.bench(|| black_box(&scheme).verify(black_box(&proof))); } +#[divan::bench] +fn calculate_external_row_from_serialized_data(bencher: Bencher) { + let alpha_path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("benches") + .join("alpha.bin"); + let r1cs_path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("benches") + .join("r1cs.bin"); + + // Load serialized data with bincode + let alpha_raw: Vec = + bincode::deserialize(&std::fs::read(&alpha_path).expect("Failed to read alpha.bin")) + .expect("Failed to deserialize alpha"); + let alpha: Vec = alpha_raw + .into_iter() + .map(|v| FieldElement::from(v)) + .collect(); + + let r1cs: R1CS = + bincode::deserialize(&std::fs::read(&r1cs_path).expect("Failed to read r1cs.bin")) + .expect("Failed to deserialize r1cs"); + + bencher.bench(|| { + black_box(calculate_external_row_of_r1cs_matrices( + black_box(&alpha), + black_box(&r1cs), + )) + }); +} + fn main() { divan::main(); } diff --git a/noir-r1cs/src/bin/profile_prove.rs b/noir-r1cs/src/bin/profile_prove.rs new file mode 100644 index 00000000..23325e77 --- /dev/null +++ b/noir-r1cs/src/bin/profile_prove.rs @@ -0,0 +1,35 @@ +//! Standalone executable for profiling noir-r1cs prove operations +use { + noir_r1cs::{read, NoirProofScheme}, + noir_tools::compile_workspace, + std::path::Path, +}; + +fn main() { + println!("Starting prove profiling..."); + + let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")); + let poseidon_path = manifest_path.join("benches").join("poseidon-1000.nps"); + let scheme: NoirProofScheme = read(&poseidon_path).unwrap(); + + let crate_dir = manifest_path.join("../noir-examples/poseidon-rounds"); + + compile_workspace(&crate_dir).expect("Compiling workspace"); + + let witness_path = crate_dir.join("Prover.toml"); + + let input_map = scheme + .read_witness(&witness_path) + .expect("Failed reading witness"); + + println!("Setup complete, starting prove operations..."); + + // Run multiple iterations for better profiling data + for i in 0..1 { + println!("Prove iteration {}", i + 1); + + let _proof = scheme.prove(&input_map); + } + + println!("Profiling complete!"); +} diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index 3ded7584..6a1aca12 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -203,7 +203,6 @@ impl Mul> for &[FieldElement] { "Vector length does not match number of rows." ); - // E let intermediate_multiplication = LockFreeArray::new(vec![(0, FieldElement::zero()); rhs.matrix.num_entries()]); From 66f8dc7e8bb1d746ef2f6c40c7db643000165128 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Wed, 25 Jun 2025 15:33:58 +0800 Subject: [PATCH 6/8] Do not initialize vector --- noir-r1cs/src/sparse_matrix.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index 6a1aca12..9754de95 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -160,7 +160,7 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { "Vector length does not match number of columns." ); - let mut result = vec![FieldElement::zero(); self.matrix.num_rows]; + let mut result = Vec::with_capacity(self.matrix.num_rows); (0..self.matrix.num_rows) .into_par_iter() From 62462ca96d964ab2498f11d134fc1f804da7cb5c Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Fri, 27 Jun 2025 12:04:15 +0800 Subject: [PATCH 7/8] first attempt at splitting result --- noir-r1cs/src/sparse_matrix.rs | 54 ++++++++++++---------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index 9754de95..d964f6e2 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -2,13 +2,14 @@ use { crate::{FieldElement, InternedFieldElement, Interner}, ark_std::Zero, rayon::iter::{ - IndexedParallelIterator, IntoParallelIterator, ParallelBridge, ParallelIterator, + IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelBridge, + ParallelIterator, }, serde::{Deserialize, Serialize}, std::{ cell::UnsafeCell, fmt::Debug, - ops::{Mul, Range}, + ops::{Div, Mul, Range}, }, }; /// A sparse matrix with interned field elements @@ -203,42 +204,25 @@ impl Mul> for &[FieldElement] { "Vector length does not match number of rows." ); - let intermediate_multiplication = - LockFreeArray::new(vec![(0, FieldElement::zero()); rhs.matrix.num_entries()]); - - let intermediate_reference = &intermediate_multiplication; - - // Mapping phase - // - // Parallelize the multiplication - // Use a lock-free array to prevent constant resizing when collecting the - // iterator as the size is not known to Rayon. Collecting without a - // preallocating the intermediate vector is >15% slower - // Other options that have been explored - // - An IndexedParallelIterator on the values of the sparse matrix also wasn't - // an option as it requires random access which we can't provide as we - // wouldn't know the row a value belongs to. That's why the rows drive the - // iterator below. - // - Acquiring a mutex per column in the result was too expensive (even with - // parking_lot) - - (0..rhs.matrix.num_rows).into_par_iter().for_each(|row| { - let range = rhs.matrix.row_range(row); - rhs.iter_row(row) - .zip(range) - .for_each(move |((col, value), ind)| unsafe { - intermediate_reference.insert(ind, (col, value * self[row])) - }) - }); - + let num_threads = rayon::current_num_threads(); let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols]; - // Reduce phase - // Single thread for folding to not have a mutex per column in the result. + let chunk_size = result.len().div_ceil(num_threads); - for (j, value) in intermediate_multiplication.0.into_inner() { - result[j] += value; - } + result + .par_iter_mut() + .chunks(chunk_size) + .enumerate() + .for_each(|(chunk_number, mut chunk)| { + let base = chunk_number * chunk_size; + let col_range = base..base + chunk_size; + rhs.iter() + .filter(|((_row, col), _value)| col_range.contains(col)) + .for_each(|((row, col), value)| { + let index = col - base; + *(chunk[index]) += self[row] * value; + }); + }); result } From 72bb6ea69120b64bf5fc3cb1b67ae87fa7773b20 Mon Sep 17 00:00:00 2001 From: Xander van der Goot Date: Fri, 27 Jun 2025 12:22:43 +0800 Subject: [PATCH 8/8] cleanup --- noir-r1cs/src/sparse_matrix.rs | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/noir-r1cs/src/sparse_matrix.rs b/noir-r1cs/src/sparse_matrix.rs index d964f6e2..7abc45d9 100644 --- a/noir-r1cs/src/sparse_matrix.rs +++ b/noir-r1cs/src/sparse_matrix.rs @@ -2,14 +2,12 @@ use { crate::{FieldElement, InternedFieldElement, Interner}, ark_std::Zero, rayon::iter::{ - IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelBridge, - ParallelIterator, + IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator, }, serde::{Deserialize, Serialize}, std::{ - cell::UnsafeCell, fmt::Debug, - ops::{Div, Mul, Range}, + ops::{Mul, Range}, }, }; /// A sparse matrix with interned field elements @@ -174,25 +172,6 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { } } -// Provide interior mutability where -struct LockFreeArray(UnsafeCell>); -unsafe impl Send for LockFreeArray {} -unsafe impl Sync for LockFreeArray {} - -impl LockFreeArray { - fn new(vec: Vec) -> Self { - let arr = vec.into_boxed_slice(); - LockFreeArray(UnsafeCell::new(arr)) - } - - // Requires that only one thread has access to index and that the index is - // within bounds. - unsafe fn insert(&self, index: usize, value: T) { - let vec = { &mut **self.0.get() }; - vec[index] = value; - } -} - /// Left multiplication by vector impl Mul> for &[FieldElement] { type Output = Vec; @@ -209,6 +188,7 @@ impl Mul> for &[FieldElement] { let chunk_size = result.len().div_ceil(num_threads); + // In microbenchmarks par_iter_mut.chunks outperforms par_chunks_mut slightly. result .par_iter_mut() .chunks(chunk_size)