-
Notifications
You must be signed in to change notification settings - Fork 29
Parallel sparse multiplication #125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
3de18cd
d333f8d
b4656d1
c18a743
17aee00
66f8dc7
62462ca
72bb6ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| //! Standalone executable for profiling noir-r1cs prove operations | ||
| use { | ||
| noir_r1cs::{read, NoirProofScheme}, | ||
| noir_tools::compile_workspace, | ||
| std::path::Path, | ||
| }; | ||
|
|
||
| fn main() { | ||
| println!("Starting prove profiling..."); | ||
|
|
||
| let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR")); | ||
| let poseidon_path = manifest_path.join("benches").join("poseidon-1000.nps"); | ||
| let scheme: NoirProofScheme = read(&poseidon_path).unwrap(); | ||
|
|
||
| let crate_dir = manifest_path.join("../noir-examples/poseidon-rounds"); | ||
|
|
||
| compile_workspace(&crate_dir).expect("Compiling workspace"); | ||
|
|
||
| let witness_path = crate_dir.join("Prover.toml"); | ||
|
|
||
| let input_map = scheme | ||
| .read_witness(&witness_path) | ||
| .expect("Failed reading witness"); | ||
|
|
||
| println!("Setup complete, starting prove operations..."); | ||
|
|
||
| // Run multiple iterations for better profiling data | ||
| for i in 0..1 { | ||
| println!("Prove iteration {}", i + 1); | ||
|
|
||
| let _proof = scheme.prove(&input_map); | ||
| } | ||
|
|
||
| println!("Profiling complete!"); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,9 @@ | ||
| use { | ||
| crate::{FieldElement, InternedFieldElement, Interner}, | ||
| ark_std::Zero, | ||
| rayon::iter::{ | ||
| IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator, | ||
| }, | ||
| serde::{Deserialize, Serialize}, | ||
| std::{ | ||
| fmt::Debug, | ||
|
|
@@ -146,7 +149,6 @@ impl HydratedSparseMatrix<'_> { | |
| } | ||
|
|
||
| /// Right multiplication by vector | ||
| // OPT: Parallelize | ||
| impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { | ||
| type Output = Vec<FieldElement>; | ||
|
|
||
|
|
@@ -156,16 +158,21 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { | |
| rhs.len(), | ||
| "Vector length does not match number of columns." | ||
| ); | ||
| let mut result = vec![FieldElement::zero(); self.matrix.num_rows]; | ||
| for ((i, j), value) in self.iter() { | ||
| result[i] += value * rhs[j]; | ||
| } | ||
|
|
||
| let mut result = Vec::with_capacity(self.matrix.num_rows); | ||
|
|
||
| (0..self.matrix.num_rows) | ||
| .into_par_iter() | ||
| .map(|i| { | ||
| self.iter_row(i) | ||
| .fold(FieldElement::zero(), |sum, (j, value)| sum + value * rhs[j]) | ||
| }) | ||
| .collect_into_vec(&mut result); | ||
| result | ||
| } | ||
| } | ||
|
|
||
| /// Left multiplication by vector | ||
| // OPT: Parallelize | ||
| impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] { | ||
| type Output = Vec<FieldElement>; | ||
|
|
||
|
|
@@ -175,10 +182,28 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] { | |
| rhs.matrix.num_rows, | ||
| "Vector length does not match number of rows." | ||
| ); | ||
|
|
||
| let num_threads = rayon::current_num_threads(); | ||
| let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols]; | ||
| for ((i, j), value) in rhs.iter() { | ||
| result[j] += value * self[i]; | ||
| } | ||
|
|
||
| let chunk_size = result.len().div_ceil(num_threads); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the perf delta is small (e.g. <5%) I think I prefer a solution that lets rayon decide the chunk size. I don't like hard dependencies on the number of available threads. The number of threads does not say much, as we might be doing other work in parallel. Instead we should pick the workload size such that it amortizes the overhead, while still allowing parallelization for large problem sizes. To approximate this I like to pick 'whatever subproblem fits in L1 cache' as the problem size. And this in turn is approximated with the |
||
|
|
||
| // In microbenchmarks par_iter_mut.chunks outperforms par_chunks_mut slightly. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If it is a small difference this might be a compiler quirk. I'd stick with |
||
| result | ||
| .par_iter_mut() | ||
| .chunks(chunk_size) | ||
| .enumerate() | ||
recmo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| .for_each(|(chunk_number, mut chunk)| { | ||
| let base = chunk_number * chunk_size; | ||
| let col_range = base..base + chunk_size; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Otherwise
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| rhs.iter() | ||
| .filter(|((_row, col), _value)| col_range.contains(col)) | ||
| .for_each(|((row, col), value)| { | ||
| let index = col - base; | ||
| *(chunk[index]) += self[row] * value; | ||
| }); | ||
| }); | ||
|
|
||
| result | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.