diff --git a/.jules/profiler.md b/.jules/profiler.md index 3a11b10..a6ffb87 100644 --- a/.jules/profiler.md +++ b/.jules/profiler.md @@ -155,3 +155,22 @@ Benchmark `bench_ising_custom` (10M iterations, 100x100 grid): - Before: ~0.72s (13.9 M/s) - After: ~0.39s (25.7 M/s) - Speedup: ~1.85x + +## 2026-11-02 - [Optimization] **Bottleneck:** Cera Prediction Reshape Allocation **Strategy:** Zero-Copy Matrix Reshape **Gain:** ~15% Time Saved (196ms -> 167ms) + +**Bottleneck:** +The `Cera::predict` method was performing excessive data copying during the reshape phase: +1. `clone_owned()` was called on the submatrix, creating a full copy. +2. `reshape_for_predictor` accumulated elements into a temporary `Vec`. +3. `DMatrix::from_row_slice` copied the data again into the final result. + +**Strategy:** +1. **Zero-Copy View:** Refactored `predict` to pass a `MatrixView` instead of cloning. +2. **Direct Write:** Updated `reshape_for_predictor` to allocate the result matrix once and write directly to it, iterating over the source view. +3. **Generic Storage:** Made the method generic over `Storage` to accept views or owned matrices. + +**Gain:** +Benchmark `bench_cera_reshape` (2000 batch size, 50 levels): +- Before: ~196ms +- After: ~167ms +- Speedup: ~15% (Eliminated intermediate allocations) diff --git a/math_explorer/examples/bench_cera_reshape.rs b/math_explorer/examples/bench_cera_reshape.rs new file mode 100644 index 0000000..df47296 --- /dev/null +++ b/math_explorer/examples/bench_cera_reshape.rs @@ -0,0 +1,51 @@ +use math_explorer::climate::cera::{Cera, CeraConfig}; +use nalgebra::DMatrix; +use std::time::Instant; + +fn main() { + let num_levels = 50; + let in_channels = 10; + let latent_channels = 8; + let aligned_channels = 8; + let output_size = 20; + let batch_size = 2000; + + let config = CeraConfig { + learning_rate: 0.001, + lambda_pred: 0.1, + lambda_emd: 0.01, + epochs: 1, + batch_size, + in_channels, + latent_channels, + aligned_channels, + num_levels, + output_size, + }; + + let cera = Cera::new(config).expect("Failed to create Cera"); + + let total_rows = batch_size * num_levels; + let inputs = DMatrix::from_fn(total_rows, in_channels, |_, _| rand::random::()); + + println!( + "Running benchmark with batch_size={}, num_levels={}, channels={}...", + batch_size, num_levels, in_channels + ); + + // Warmup + for _ in 0..5 { + let _ = cera.predict(&inputs); + } + + let iterations = 20; + let start = Instant::now(); + for _ in 0..iterations { + let _ = cera.predict(&inputs); + } + let duration = start.elapsed(); + + let avg_time = duration.as_secs_f64() / iterations as f64; + println!("Total time: {:?}", duration); + println!("Average time per prediction: {:.6} seconds", avg_time); +} diff --git a/math_explorer/src/climate/cera.rs b/math_explorer/src/climate/cera.rs index 7f1c3a3..68a513f 100644 --- a/math_explorer/src/climate/cera.rs +++ b/math_explorer/src/climate/cera.rs @@ -98,24 +98,30 @@ impl Cera { /// # Returns /// /// The reshaped matrix ready for the predictor. - fn reshape_for_predictor( + fn reshape_for_predictor( &self, - latent_matrix: &DMatrix, + latent_matrix: &nalgebra::Matrix, batch_size: usize, - ) -> DMatrix { + ) -> DMatrix + where + S: nalgebra::storage::Storage, + { let num_levels = self.config.num_levels; let aligned_channels = self.config.aligned_channels; - let mut reshaped_data = Vec::with_capacity(batch_size * num_levels * aligned_channels); + let mut out = DMatrix::zeros(batch_size, num_levels * aligned_channels); + for i in 0..batch_size { let start_row = i * num_levels; let sample_latent = latent_matrix.rows(start_row, num_levels); - for r in sample_latent.row_iter() { - for element in r.iter() { - reshaped_data.push(*element); + + for r in 0..num_levels { + for c in 0..aligned_channels { + // Safety: bounds checked by logic + out[(i, r * aligned_channels + c)] = sample_latent[(r, c)]; } } } - DMatrix::from_row_slice(batch_size, num_levels * aligned_channels, &reshaped_data) + out } /// Makes a prediction using the trained CERA model. @@ -133,7 +139,7 @@ impl Cera { let batch_size = inputs.nrows() / num_levels; let latent = self.autoencoder.encoder.forward(inputs); - let aligned_latent = latent.columns(0, aligned_channels).clone_owned(); + let aligned_latent = latent.columns(0, aligned_channels); let predictor_input = self.reshape_for_predictor(&aligned_latent, batch_size); self.predictor.forward(&predictor_input) }