diff --git a/Cargo.toml b/Cargo.toml index 329cc316..bfaa488e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,10 +41,8 @@ missing_docs_in_private_items = { level = "allow", priority = 1 } missing_safety_doc = { level = "deny", priority = 1 } [profile.release] -debug = true # Generate symbol info for profiling +debug = true # Generate symbol info for profiling opt-level = 3 -codegen-units = 1 -lto = "fat" # Doing light optimizations helps test performance more than it hurts build time. [profile.test] diff --git a/noir-r1cs/Cargo.toml b/noir-r1cs/Cargo.toml index 21c1f316..450498de 100644 --- a/noir-r1cs/Cargo.toml +++ b/noir-r1cs/Cargo.toml @@ -53,6 +53,8 @@ ark-poly.workspace = true ark-std.workspace = true ark-serialize.workspace = true +noir-tools.workspace = true + # Binary # See argh.workspace = true diff --git a/noir-r1cs/benches/bench.rs b/noir-r1cs/benches/bench.rs index 59aa1583..75acad82 100644 --- a/noir-r1cs/benches/bench.rs +++ b/noir-r1cs/benches/bench.rs @@ -1,8 +1,12 @@ //! 
+    let alpha_raw: Vec<u64> = +        bincode::deserialize(&std::fs::read(&alpha_path).expect("Failed to read alpha.bin")) +            .expect("Failed to deserialize alpha"); +    let alpha: Vec<FieldElement> = alpha_raw
impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { type Output = Vec<FieldElement>;
impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] { type Output = Vec<FieldElement>;