Rust with HPTT is **equal to or faster than Julia on every instance Julia completes**, in both
| +| tn_light | 451 | 500 | **0.90x** | + +### 4T — opt_flops + +| Instance | Rust (ms) | Julia (ms) | Ratio | +|---|---|---|---| +| gm_queen5_5_3 | 4092 | SKIP | — | +| lm_brackets_4_4d | 20 | 42 | **0.49x** | +| lm_sentence_3_12d | 53 | 59 | **0.89x** | +| lm_sentence_4_4d | 23 | 46 | **0.51x** | +| str_matrix_chain_100 | 14 | 17 | **0.81x** | +| str_mps_varying_200 | 23 | 59 | **0.39x** | +| mera_closed | 587 | 931 | **0.63x** | +| mera_open | 353 | 798 | **0.44x** | +| tn_focus | 352 | 444 | **0.79x** | +| tn_light | 358 | 446 | **0.80x** | + +### 4T — opt_size + +| Instance | Rust (ms) | Julia (ms) | Ratio | +|---|---|---|---| +| gm_queen5_5_3 | 1202 | SKIP | — | +| lm_brackets_4_4d | 22 | 40 | **0.55x** | +| lm_sentence_3_12d | 39 | 63 | **0.63x** | +| lm_sentence_4_4d | 29 | 49 | **0.59x** | +| str_matrix_chain_100 | 8 | 16 | **0.48x** | +| str_mps_varying_200 | 20 | 44 | **0.45x** | +| mera_closed | 457 | 704 | **0.65x** | +| mera_open | 356 | 796 | **0.45x** | +| tn_focus | 353 | 457 | **0.77x** | +| tn_light | 356 | 464 | **0.77x** | + +## Analysis + +Rust with HPTT is **equal or faster than Julia on every instance**, in both +1T and 4T configurations. + +- **1T**: Rust is 0.42x–0.98x of Julia (2–58% faster across instances) +- **4T**: Rust is 0.39x–0.89x of Julia (11–61% faster across instances) +- The 4T advantage is larger because strided-rs parallelizes both + permutation copies (via rayon) and GEMM (via OpenBLAS threads), while + Julia's OMEinsum only parallelizes GEMM + +Even `tn_focus` and `tn_light` — the instances where HPTT is slower than +source-stride-order in isolation (see `src-vs-dst-order-experiment.md`) — +still outperform Julia. The copy elision (`try_fuse_group`) compensates for +HPTT's overhead on these degenerate many-small-dims cases. + +## Conclusion + +**HPTT can be the default copy strategy** for the `Contract` CPU backend. 
+There is no need for an adaptive strategy that switches between HPTT and +source-stride-order based on tensor shape. The combination of copy elision + +HPTT is sufficient to match or exceed Julia's OMEinsum.jl on all tested +workloads. + +## Notes + +- `gm_queen5_5_3` is skipped by Julia due to a `MethodError` (3D+ array + incompatibility in OMEinsum.jl) +- Julia's IQR is generally higher than Rust's, suggesting more variance + (likely due to GC pressure) diff --git a/docs/src-vs-dst-order-experiment.md b/docs/src-vs-dst-order-experiment.md new file mode 100644 index 0000000..0c0ddf6 --- /dev/null +++ b/docs/src-vs-dst-order-experiment.md @@ -0,0 +1,119 @@ +# Source-Stride-Order vs Destination-Stride-Order (HPTT) Experiment + +Branch: `perf/src-vs-dst-order` + +## Hypothesis + +The eager-HPTT experiment showed a 26–31% regression on `mera_open` when +permutations are eagerly materialized. However, that experiment conflated two +factors: **copy elision** (`try_fuse_group`) and **copy strategy** (source-order +vs destination-order). This experiment isolates the copy strategy factor by +disabling copy elision (`force-copy` feature) and comparing source-stride-order +copy vs HPTT (destination-stride-order) copy. + +## Setup + +Two feature flags added to `strided-einsum2`: + +- `force-copy`: Forces `needs_copy = true` in all `prepare_input_*` and + `prepare_output_*` functions, disabling `try_fuse_group` elision. +- `hptt-input-copy`: Switches `prepare_input_owned` from source-stride-order + copy to HPTT (`strided_kernel::copy_into_col_major`). + +Three configurations benchmarked: + +1. **Baseline**: Default (copy elision enabled, src-order when copy needed) +2. **force-copy + src-order**: Copy elision disabled, source-stride-order copy +3. 
**force-copy + HPTT**: Copy elision disabled, HPTT destination-order copy + +## Results (AMD EPYC 7713P, faer, 1T) + +### opt_flops + +| Instance | Baseline | force+src | force+HPTT | src vs HPTT | +|---|---|---|---|---| +| gm_queen5_5_3 | 5775 ms | 6400 ms (+11%) | 6052 ms (+5%) | HPTT 5% faster | +| lm_brackets_4_4d | 19 ms | 35 ms (+80%) | 24 ms (+23%) | **HPTT 31% faster** | +| lm_sentence_3_12d | 65 ms | 92 ms (+42%) | 78 ms (+19%) | **HPTT 16% faster** | +| lm_sentence_4_4d | 17 ms | 34 ms (+99%) | 24 ms (+41%) | **HPTT 29% faster** | +| str_matrix_chain_100 | 11 ms | 20 ms (+80%) | 14 ms (+25%) | **HPTT 30% faster** | +| str_mps_varying_200 | 16 ms | 21 ms (+31%) | 16 ms (-3%) | **HPTT 25% faster** | +| mera_closed | 1518 ms | 1739 ms (+15%) | 1567 ms (+3%) | **HPTT 10% faster** | +| mera_open | 935 ms | 1129 ms (+21%) | 1142 ms (+22%) | ~same (+1%) | +| tn_focus | 288 ms | 400 ms (+39%) | 568 ms (+97%) | **src 30% faster** | +| tn_light | 289 ms | 401 ms (+39%) | 560 ms (+94%) | **src 28% faster** | + +### opt_size + +| Instance | Baseline | force+src | force+HPTT | src vs HPTT | +|---|---|---|---|---| +| gm_queen5_5_3 | 1727 ms | 2471 ms (+43%) | 2290 ms (+33%) | HPTT 7% faster | +| lm_brackets_4_4d | 19 ms | 35 ms (+82%) | 20 ms (+5%) | **HPTT 43% faster** | +| lm_sentence_3_12d | 52 ms | 78 ms (+49%) | 59 ms (+14%) | **HPTT 24% faster** | +| lm_sentence_4_4d | 22 ms | 37 ms (+69%) | 25 ms (+18%) | **HPTT 30% faster** | +| str_matrix_chain_100 | 11 ms | 20 ms (+85%) | 15 ms (+36%) | **HPTT 26% faster** | +| str_mps_varying_200 | 14 ms | 23 ms (+68%) | 13 ms (-3%) | **HPTT 43% faster** | +| mera_closed | 1480 ms | 1496 ms (+1%) | 1322 ms (-11%) | **HPTT 12% faster** | +| mera_open | 934 ms | 1108 ms (+19%) | 1086 ms (+16%) | ~same (-2%) | +| tn_focus | 288 ms | 394 ms (+37%) | 541 ms (+88%) | **src 27% faster** | +| tn_light | 289 ms | 387 ms (+34%) | 532 ms (+84%) | **src 27% faster** | + +## Analysis + +### HPTT is faster for most workloads + +Contrary 
to the initial assumption that source-stride-order is generally
+superior, HPTT outperforms source-order on 8 out of 10 instances (5–43%
+faster).
**The optimal `Contract` implementation should use an adaptive copy strategy**:
This is ideal when the source data is warm in cache, but in -/// `prepare_input_owned` the source is usually a large intermediate whose -/// L3 cache lines have been evicted by subsequent contraction steps. -/// With cold-cache source and many small non-contiguous dimensions (e.g. -/// 24 binary dims of size 2 after a metadata-only permutation), HPTT's -/// bilateral fusion leaves ~17 fused dims with a 2×2 inner tile and 15 -/// levels of recursion per 4 elements — both cache-unfriendly reads AND -/// high per-element overhead. -/// -/// Source-stride-order iteration gives sequential reads that exploit the -/// hardware prefetcher, which dominates performance on cold-cache, -/// memory-bandwidth-bound copies. -unsafe fn copy_strided_src_order( - src_ptr: *const T, - dst_ptr: *mut T, - dims: &[usize], - src_strides: &[isize], - dst_strides: &[isize], -) { - let ndim = dims.len(); - let n: usize = dims.iter().product(); - if n == 0 { - return; - } - - // Sort dimensions by source stride (ascending) → innermost = smallest src stride - let mut dim_order: Vec = (0..ndim).filter(|&i| dims[i] > 1).collect(); - dim_order.sort_by_key(|&i| src_strides[i].unsigned_abs()); - - let sorted_dims: Vec = dim_order.iter().map(|&i| dims[i]).collect(); - let sorted_src: Vec = dim_order.iter().map(|&i| src_strides[i]).collect(); - let sorted_dst: Vec = dim_order.iter().map(|&i| dst_strides[i]).collect(); - let nd = sorted_dims.len(); - - let mut idx = vec![0usize; nd]; - let mut so: isize = 0; - let mut do_: isize = 0; - - for _ in 0..n { - *dst_ptr.offset(do_) = *src_ptr.offset(so); - - for d in 0..nd { - idx[d] += 1; - if idx[d] < sorted_dims[d] { - so += sorted_src[d]; - do_ += sorted_dst[d]; - break; - } else { - so -= sorted_src[d] * (sorted_dims[d] as isize - 1); - do_ -= sorted_dst[d] * (sorted_dims[d] as isize - 1); - idx[d] = 0; - } - } - } -} - -/// Parallel version of [`copy_strided_src_order`]. 
-/// -/// Outer dimensions (by source stride) are split across rayon threads; each -/// thread runs a sequential odometer over the inner dimensions. -/// Falls back to the single-threaded version for small tensors. -#[cfg(feature = "parallel")] -unsafe fn copy_strided_src_order_par( - src_ptr: *const T, - dst_ptr: *mut T, - dims: &[usize], - src_strides: &[isize], - dst_strides: &[isize], -) { - let ndim = dims.len(); - let n: usize = dims.iter().product(); - if n == 0 { - return; - } - - // Fall back to sequential when parallelism would add overhead without gain. - const PAR_THRESHOLD: usize = 1 << 20; // 1M elements - if n < PAR_THRESHOLD || rayon::current_num_threads() <= 1 { - copy_strided_src_order(src_ptr, dst_ptr, dims, src_strides, dst_strides); - return; - } - - // Sort dimensions by source stride (ascending) → innermost = smallest src stride - let mut dim_order: Vec = (0..ndim).filter(|&i| dims[i] > 1).collect(); - dim_order.sort_by_key(|&i| src_strides[i].unsigned_abs()); - - let sorted_dims: Vec = dim_order.iter().map(|&i| dims[i]).collect(); - let sorted_src: Vec = dim_order.iter().map(|&i| src_strides[i]).collect(); - let sorted_dst: Vec = dim_order.iter().map(|&i| dst_strides[i]).collect(); - let nd = sorted_dims.len(); - - // Peel outer dims until we have enough parallel tasks (>= 4× threads). - let min_tasks = rayon::current_num_threads() * 4; - let mut split_at = nd; // index into sorted arrays: [0..split_at) inner, [split_at..nd) outer - let mut par_count: usize = 1; - while split_at > 0 && par_count < min_tasks { - split_at -= 1; - par_count *= sorted_dims[split_at]; - } - - let inner_n: usize = sorted_dims[..split_at].iter().product::().max(1); - - // Convert pointers to usize for Send (same pattern as strided-perm). 
- let src_addr = src_ptr as usize; - let dst_addr = dst_ptr as usize; - - let outer_dims = sorted_dims[split_at..].to_vec(); - let outer_src = sorted_src[split_at..].to_vec(); - let outer_dst = sorted_dst[split_at..].to_vec(); - let inner_dims = sorted_dims[..split_at].to_vec(); - let inner_src = sorted_src[..split_at].to_vec(); - let inner_dst = sorted_dst[..split_at].to_vec(); - - (0..par_count).into_par_iter().for_each(|outer_idx| { - // Compute base offsets from outer multi-index. - let mut src_off: isize = 0; - let mut dst_off: isize = 0; - let mut rem = outer_idx; - for d in 0..outer_dims.len() { - let i = rem % outer_dims[d]; - rem /= outer_dims[d]; - src_off += i as isize * outer_src[d]; - dst_off += i as isize * outer_dst[d]; - } - - let sp = (src_addr as isize + src_off * std::mem::size_of::() as isize) as *const T; - let dp = (dst_addr as isize + dst_off * std::mem::size_of::() as isize) as *mut T; - - if split_at == 0 { - // No inner dims — single element per task. - unsafe { *dp = *sp }; - return; - } - - // Sequential odometer over inner dims. - let mut idx = vec![0usize; split_at]; - let mut so: isize = 0; - let mut do_: isize = 0; - - for _ in 0..inner_n { - unsafe { *dp.offset(do_) = *sp.offset(so) }; - - for d in 0..split_at { - idx[d] += 1; - if idx[d] < inner_dims[d] { - so += inner_src[d]; - do_ += inner_dst[d]; - break; - } else { - so -= inner_src[d] * (inner_dims[d] as isize - 1); - do_ -= inner_dst[d] * (inner_dims[d] as isize - 1); - idx[d] = 0; - } - } - } - }); -} - /// Prepare an owned input array for GEMM. /// /// Expects batch-last canonical order: `[group1..., group2..., batch...]`. @@ -603,37 +433,7 @@ pub fn prepare_input_owned( if needs_copy { let m: usize = group1_dims.iter().product::().max(1); let (mut buf, buf_is_pooled) = alloc_col_major_uninit_with_pool(&dims); - // Use source-stride-order copy instead of HPTT (strided_kernel::copy_into_col_major). 
- // einsum2 always produces col-major output and only metadata permutations - // are applied between steps, so the source is physically contiguous but has - // scattered strides. HPTT iterates in destination order → scattered reads - // from cold L3 cache. Source-order iteration gives sequential reads that - // exploit the hardware prefetcher. See doc comment on copy_strided_src_order. - { - let dst_strides = buf.strides().to_vec(); - unsafe { - #[cfg(feature = "parallel")] - { - copy_strided_src_order_par( - arr.view().ptr(), - buf.view_mut().as_mut_ptr(), - &dims, - &strides, - &dst_strides, - ); - } - #[cfg(not(feature = "parallel"))] - { - copy_strided_src_order( - arr.view().ptr(), - buf.view_mut().as_mut_ptr(), - &dims, - &strides, - &dst_strides, - ); - } - } - } + strided_kernel::copy_into_col_major(&mut buf.view_mut(), &arr.view())?; let ptr = buf.view().ptr(); let batch_strides = buf.strides()[n_inner..].to_vec(); let row_stride = if m == 0 { 0 } else { 1isize };