4 changes: 1 addition & 3 deletions Cargo.toml
@@ -41,10 +41,8 @@ missing_docs_in_private_items = { level = "allow", priority = 1 }
 missing_safety_doc = { level = "deny", priority = 1 }
 
 [profile.release]
-debug = true # Generate symbol info for profiling
+debug = true # Generate symbol info for profiling
 opt-level = 3
-codegen-units = 1
-lto = "fat"
 
 # Doing light optimizations helps test performance more than it hurts build time.
 [profile.test]
2 changes: 2 additions & 0 deletions noir-r1cs/Cargo.toml
@@ -53,6 +53,8 @@ ark-poly.workspace = true
 ark-std.workspace = true
 ark-serialize.workspace = true
 
+noir-tools.workspace = true
+
 # Binary
 # See <https://github.com/rust-lang/cargo/issues/1982>
 argh.workspace = true
36 changes: 35 additions & 1 deletion noir-r1cs/benches/bench.rs
@@ -1,8 +1,12 @@
 //! Divan benchmarks for noir-r1cs
 use {
+    bincode,
     core::hint::black_box,
     divan::Bencher,
-    noir_r1cs::{read, NoirProof, NoirProofScheme},
+    noir_r1cs::{
+        read, utils::sumcheck::calculate_external_row_of_r1cs_matrices, FieldElement, NoirProof,
+        NoirProofScheme, R1CS,
+    },
     noir_tools::compile_workspace,
     std::path::Path,
 };
@@ -67,6 +71,36 @@ fn verify_poseidon_1000(bencher: Bencher) {
     bencher.bench(|| black_box(&scheme).verify(black_box(&proof)));
 }
 
+#[divan::bench]
+fn calculate_external_row_from_serialized_data(bencher: Bencher) {
+    let alpha_path = Path::new(env!("CARGO_MANIFEST_DIR"))
+        .join("benches")
+        .join("alpha.bin");
+    let r1cs_path = Path::new(env!("CARGO_MANIFEST_DIR"))
+        .join("benches")
+        .join("r1cs.bin");
+
+    // Load serialized data with bincode
+    let alpha_raw: Vec<u64> =
+        bincode::deserialize(&std::fs::read(&alpha_path).expect("Failed to read alpha.bin"))
+            .expect("Failed to deserialize alpha");
+    let alpha: Vec<FieldElement> = alpha_raw
+        .into_iter()
+        .map(FieldElement::from)
+        .collect();
+
+    let r1cs: R1CS =
+        bincode::deserialize(&std::fs::read(&r1cs_path).expect("Failed to read r1cs.bin"))
+            .expect("Failed to deserialize r1cs");
+
+    bencher.bench(|| {
+        black_box(calculate_external_row_of_r1cs_matrices(
+            black_box(&alpha),
+            black_box(&r1cs),
+        ))
+    });
+}
+
 fn main() {
     divan::main();
 }
35 changes: 35 additions & 0 deletions noir-r1cs/src/bin/profile_prove.rs
@@ -0,0 +1,35 @@
+//! Standalone executable for profiling noir-r1cs prove operations
+use {
+    noir_r1cs::{read, NoirProofScheme},
+    noir_tools::compile_workspace,
+    std::path::Path,
+};
+
+fn main() {
+    println!("Starting prove profiling...");
+
+    let manifest_path = Path::new(env!("CARGO_MANIFEST_DIR"));
+    let poseidon_path = manifest_path.join("benches").join("poseidon-1000.nps");
+    let scheme: NoirProofScheme = read(&poseidon_path).unwrap();
+
+    let crate_dir = manifest_path.join("../noir-examples/poseidon-rounds");
+
+    compile_workspace(&crate_dir).expect("Compiling workspace");
+
+    let witness_path = crate_dir.join("Prover.toml");
+
+    let input_map = scheme
+        .read_witness(&witness_path)
+        .expect("Failed reading witness");
+
+    println!("Setup complete, starting prove operations...");
+
+    // Increase the upper bound to run multiple iterations for better profiling data
+    for i in 0..1 {
+        println!("Prove iteration {}", i + 1);
+
+        let _proof = scheme.prove(&input_map);
+    }
+
+    println!("Profiling complete!");
+}
43 changes: 34 additions & 9 deletions noir-r1cs/src/sparse_matrix.rs
@@ -1,6 +1,9 @@
 use {
     crate::{FieldElement, InternedFieldElement, Interner},
     ark_std::Zero,
+    rayon::iter::{
+        IndexedParallelIterator, IntoParallelIterator, IntoParallelRefMutIterator, ParallelIterator,
+    },
     serde::{Deserialize, Serialize},
     std::{
         fmt::Debug,
@@ -146,7 +149,6 @@ impl HydratedSparseMatrix<'_> {
 }
 
 /// Right multiplication by vector
-// OPT: Paralelize
 impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
     type Output = Vec<FieldElement>;
 
@@ -156,16 +158,21 @@ impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> {
             rhs.len(),
             "Vector length does not match number of columns."
         );
-        let mut result = vec![FieldElement::zero(); self.matrix.num_rows];
-        for ((i, j), value) in self.iter() {
-            result[i] += value * rhs[j];
-        }
+
+        let mut result = Vec::with_capacity(self.matrix.num_rows);
+
+        (0..self.matrix.num_rows)
+            .into_par_iter()
+            .map(|i| {
+                self.iter_row(i)
+                    .fold(FieldElement::zero(), |sum, (j, value)| sum + value * rhs[j])
+            })
+            .collect_into_vec(&mut result);
         result
     }
 }
 
 /// Left multiplication by vector
-// OPT: Paralelize
 impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
     type Output = Vec<FieldElement>;
 
@@ -175,10 +182,28 @@ impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] {
             rhs.matrix.num_rows,
             "Vector length does not match number of rows."
         );
+
+        let num_threads = rayon::current_num_threads();
         let mut result = vec![FieldElement::zero(); rhs.matrix.num_cols];
-        for ((i, j), value) in rhs.iter() {
-            result[j] += value * self[i];
-        }
+
+        let chunk_size = result.len().div_ceil(num_threads);
+
Contributor:

If the perf delta is small (e.g. <5%) I think I prefer a solution that lets rayon decide the chunk size. I don't like hard dependencies on the number of available threads. The thread count does not say much, as we might be doing other work in parallel.

Instead we should pick the workload size such that it amortizes the overhead, while still allowing parallelization for large problem sizes. To approximate this I like to pick "whatever subproblem fits in L1 cache" as the problem size, which in turn is approximated with the workload_size::<F>(result.len()) function.
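
A minimal sketch of that suggestion, assuming workload_size is a project-local helper rather than a rayon API (the name, signature, and L1 size constant below are illustrative guesses, not the repo's actual implementation):

// Hypothetical helper: approximates "largest subproblem that fits in L1".
// The 32 KiB cache size is an assumed conservative default, not measured.
fn workload_size<T>(len: usize) -> usize {
    const L1_BYTES: usize = 32 * 1024;
    let per_chunk = L1_BYTES / std::mem::size_of::<T>().max(1);
    per_chunk.clamp(1, len.max(1))
}

// Usage in place of the thread-count based sizing above:
// let chunk_size = workload_size::<FieldElement>(result.len());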


+        // In microbenchmarks par_iter_mut.chunks outperforms par_chunks_mut slightly.
Contributor:

If it is a small difference this might be a compiler quirk. I'd stick with par_chunks_mut for simplicity and maintainability.

If .par_iter_mut().chunks(..) is faster, then this is basically a bug somewhere, as par_chunks_mut provides the libraries/compiler with strictly more information. This bug is likely to be fixed at some point.
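
For reference, a sketch of the par_chunks_mut form (assuming rayon::prelude::* is in scope; it also folds in the chunk.len() bound suggested in the next comment):

result
    .par_chunks_mut(chunk_size)
    .enumerate()
    .for_each(|(chunk_number, chunk)| {
        let base = chunk_number * chunk_size;
        // chunk.len() keeps the range in bounds for a short final chunk.
        let col_range = base..base + chunk.len();
        rhs.iter()
            .filter(|((_row, col), _value)| col_range.contains(col))
            .for_each(|((row, col), value)| {
                // chunk is a contiguous &mut [FieldElement] slice, so no
                // per-element indirection through chunk[index] is needed.
                chunk[col - base] += self[row] * value;
            });
    });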

+        result
+            .par_iter_mut()
+            .chunks(chunk_size)
+            .enumerate()
+            .for_each(|(chunk_number, mut chunk)| {
+                let base = chunk_number * chunk_size;
+                let col_range = base..base + chunk_size;
Contributor:

let col_range = base..base + chunk.len();

Otherwise col_range will be out of bounds when chunk_size doesn't divide result.len(). You are protected from this bug here because col will only ever be in range, but better to do it right.

Contributor:

let base = chunk_number * chunk_size should still be correct, as only the last chunk is not exactly chunk_size (but please confirm with the rayon docs, and maybe leave a comment explaining correctness).
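
A quick standalone check of that claim (rayon documents that only the final chunk of par_chunks may be shorter than the requested size):

use rayon::prelude::*;

fn main() {
    let v: Vec<u64> = (0..10).collect();
    let chunk_size = 4;
    v.par_chunks(chunk_size).enumerate().for_each(|(n, chunk)| {
        // Every chunk starts at n * chunk_size; only the last may be shorter.
        assert_eq!(chunk[0], (n * chunk_size) as u64);
        assert!(chunk.len() == chunk_size || n == v.len().div_ceil(chunk_size) - 1);
    });
}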

+                rhs.iter()
+                    .filter(|((_row, col), _value)| col_range.contains(col))
+                    .for_each(|((row, col), value)| {
+                        let index = col - base;
+                        *(chunk[index]) += self[row] * value;
+                    });
+            });
+
         result
     }
 }