diff --git a/Cargo.toml b/Cargo.toml index 329cc316..bfaa488e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,10 +41,8 @@ missing_docs_in_private_items = { level = "allow", priority = 1 } missing_safety_doc = { level = "deny", priority = 1 } [profile.release] -debug = true # Generate symbol info for profiling +debug = true # Generate symbol info for profiling opt-level = 3 -codegen-units = 1 -lto = "fat" # Doing light optimizations helps test performance more than it hurts build time. [profile.test] diff --git a/noir-r1cs/Cargo.toml b/noir-r1cs/Cargo.toml index 21c1f316..450498de 100644 --- a/noir-r1cs/Cargo.toml +++ b/noir-r1cs/Cargo.toml @@ -53,6 +53,8 @@ ark-poly.workspace = true ark-std.workspace = true ark-serialize.workspace = true +noir-tools.workspace = true + # Binary # See argh.workspace = true diff --git a/noir-r1cs/benches/bench.rs b/noir-r1cs/benches/bench.rs index 59aa1583..75acad82 100644 --- a/noir-r1cs/benches/bench.rs +++ b/noir-r1cs/benches/bench.rs @@ -1,8 +1,12 @@ //! 
+    let alpha_raw: Vec<u64> = +        bincode::deserialize(&std::fs::read(&alpha_path).expect("Failed to read alpha.bin")) +            .expect("Failed to deserialize alpha"); +    let alpha: Vec<FieldElement> = alpha_raw
impl Mul<&[FieldElement]> for HydratedSparseMatrix<'_> { type Output = Vec<FieldElement>;
impl Mul<HydratedSparseMatrix<'_>> for &[FieldElement] { type Output = Vec<FieldElement>;