1 change: 1 addition & 0 deletions .gitignore
@@ -177,6 +177,7 @@ cython_debug/
/tests/data
*.npy
*.feat
*.prof
/tests/output/*
!/tests/output/figs
/tests/output/figs/*
9 changes: 9 additions & 0 deletions README.md
@@ -49,4 +49,13 @@ repository of Interpretable Deconvolution for Calcium imaging.
1. Run `CUDA_PATH=whatever python setup.py install`.
If you followed the steps correctly, `CUDA_PATH` shouldn't matter (but it has to be set).
1. Verify that `cuosqp` is installed under your environment.

## Profiling

For comprehensive profiling documentation, see [`benchmarks/profile/README.md`](benchmarks/profile/README.md).

Quick reference:
- **Line-level profiling**: Use `kernprof -l -v your_script.py` for functions decorated with `@profile`
- **Pipeline-level profiling**: Use `yappi_profile` context manager for function-level attribution
- **Benchmark scripts**: Deterministic benchmarks in `benchmarks/profile/` for regression detection

158 changes: 158 additions & 0 deletions benchmarks/profile/README.md
@@ -0,0 +1,158 @@
# Profiling

InDeCa provides two complementary profiling approaches for performance analysis and optimization.

## 1. Line-Level Profiling (line_profiler)

For deep numerical inspection of hot loops, solvers, and kernel construction.

Functions decorated with `@profile` can be profiled using:

```bash
kernprof -l -v your_script.py
```

This is particularly useful for:
- Hot inner loops
- Solver internals
- Kernel construction
- Deconvolution steps
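
The `@profile` decorator is injected as a builtin by kernprof at runtime; a common idiom is to define a no-op fallback so the same script also runs without kernprof. A minimal sketch (`decay_filter` is a hypothetical hot-loop function for illustration, not part of InDeCa):

```python
import numpy as np

# Fallback: make @profile a no-op when the script is run directly,
# so it works both with and without `kernprof -l -v`.
try:
    profile
except NameError:
    def profile(func):
        return func


@profile
def decay_filter(spikes, decay=0.9):
    # Hot inner loop: AR(1)-style calcium trace from a spike train.
    # line_profiler will report per-line hit counts and timings here.
    c = np.zeros(len(spikes))
    c[0] = spikes[0]
    for t in range(1, len(spikes)):
        c[t] = decay * c[t - 1] + spikes[t]
    return c


if __name__ == "__main__":
    decay_filter(np.random.default_rng(0).poisson(0.1, size=10000))
```

Running `kernprof -l -v` on such a script prints a per-line timing table for every `@profile`-decorated function.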

## 2. Pipeline-Level Profiling (yappi + snakeviz)

For function-level attribution and call graph analysis.

### Quick Start

```python
from indeca.utils.profiling import yappi_profile
from indeca.pipeline import pipeline_bin_new, DeconvPipelineConfig

Y = load_your_data()
config = DeconvPipelineConfig(...)

with yappi_profile("pipeline.prof"):
C, S, metrics = pipeline_bin_new(Y, config=config, spawn_dashboard=False)
```

View results:
```bash
snakeviz pipeline.prof
```

### Clock Types

- **wall** (default): Real elapsed time, includes I/O and waiting
- **cpu**: Actual computation time, excludes I/O

```python
with yappi_profile("cpu_profile.prof", clock="cpu"):
...
```

### Usage

The `yappi_profile` context manager wraps code execution and saves profiling data in pstat format:

```python
from indeca.utils.profiling import yappi_profile

with yappi_profile("output.prof", clock="wall"):
# Your code here
result = expensive_function()
```

## Benchmark Scripts

Deterministic benchmarks for performance regression detection. All scripts use fixed seeds and configurations for reproducible results.
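
The fixed-seed reproducibility these scripts rely on can be sketched with NumPy's `default_rng` (a generic illustration, not the scripts' exact code):

```python
import numpy as np


def make_data(seed: int, shape=(4, 8)) -> np.ndarray:
    # A fresh Generator per call: the same seed yields identical draws,
    # which is what makes benchmark runs comparable across commits.
    rng = np.random.default_rng(seed)
    return rng.normal(size=shape)


a = make_data(42)
b = make_data(42)
print(np.array_equal(a, b))  # True: identical seeds give identical data
```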

### Small Benchmark (10 cells × 1K frames)

Quick iterations and fast profiling:

```bash
# Quick runtime check
python benchmarks/profile/profile_pipeline_small.py

# With yappi profiling (wall-clock time)
python benchmarks/profile/profile_pipeline_small.py --profile

# With yappi profiling (CPU time)
python benchmarks/profile/profile_pipeline_small.py --profile --clock cpu

# View results
snakeviz benchmarks/profile/output/profile_pipeline_small.prof
```

### Medium Benchmark (50 cells × 5K frames)

Realistic workload testing:

```bash
# Quick runtime check
python benchmarks/profile/profile_pipeline_medium.py

# With profiling
python benchmarks/profile/profile_pipeline_medium.py --profile

# View results
snakeviz benchmarks/profile/output/profile_pipeline_medium.prof
```

### Large Benchmark (100 cells × 10K frames)

Comprehensive profiling (**warning**: may take several minutes):

```bash
# Quick runtime check
python benchmarks/profile/profile_pipeline_large.py

# With profiling
python benchmarks/profile/profile_pipeline_large.py --profile

# View results
snakeviz benchmarks/profile/output/profile_pipeline_large.prof
```

## When to Use Each Tool

| Tool | Use Case |
|------|----------|
| line_profiler | Hot inner loops, solver internals, kernel construction |
| yappi | Pipeline flow, function call attribution, call graphs |
| Benchmark scripts | Performance regression detection, optimization validation |

## Performance Regression Workflow

1. **Baseline**: Run benchmark without profiling to establish baseline runtime
```bash
python benchmarks/profile/profile_pipeline_small.py
```

2. **Profile**: Run with profiling to identify bottlenecks
```bash
python benchmarks/profile/profile_pipeline_small.py --profile
```

3. **Analyze**: View call graph and function timings in snakeviz
```bash
snakeviz benchmarks/profile/output/profile_pipeline_small.prof
```

4. **Optimize**: Focus on functions with highest cumulative time

5. **Validate**: Re-run benchmark to measure improvement
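
Steps 1 and 5 can be automated with a small comparison helper (a generic sketch; the JSON baseline file and tolerance threshold are assumptions, not part of the repo):

```python
import json
import time
from pathlib import Path


def check_regression(run, baseline_file="baseline.json", tolerance=0.10):
    """Time `run()`, compare against a stored baseline, and flag regressions.

    The first invocation records the baseline; later invocations report a
    regression if they exceed it by more than `tolerance` (fractional).
    """
    t0 = time.perf_counter()
    run()
    elapsed = time.perf_counter() - t0

    path = Path(baseline_file)
    if path.exists():
        baseline = json.loads(path.read_text())["elapsed"]
        regressed = elapsed > baseline * (1 + tolerance)
    else:
        # No baseline yet: record this run as the reference point.
        baseline, regressed = None, False
        path.write_text(json.dumps({"elapsed": elapsed}))
    return elapsed, baseline, regressed
```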

## Output Location

All profiling output files are saved in `benchmarks/profile/output/`:

- `profile_pipeline_small.prof`
- `profile_pipeline_medium.prof`
- `profile_pipeline_large.prof`

These files are in pstat format and can be viewed with:
- **snakeviz** (recommended): Interactive web-based visualization
- **gprof2dot**: Generate call graph diagrams
- **pyprof2calltree**: Convert for use with kcachegrind
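
Because these files are standard pstats data, the stdlib `pstats` module can also inspect them without any extra tooling. A self-contained sketch (the profile here is generated with `cProfile` so the snippet runs on its own; substitute one of the benchmark `.prof` paths in practice):

```python
import cProfile
import pstats

# Generate a small pstats file (stand-in for a benchmark's .prof output).
cProfile.run("sum(i * i for i in range(100000))", "example.prof")

# Load it and print the ten entries with the highest cumulative time,
# mirroring what snakeviz shows interactively.
stats = pstats.Stats("example.prof")
stats.sort_stats("cumulative").print_stats(10)
```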

183 changes: 183 additions & 0 deletions benchmarks/profile/profile_pipeline_large.py
@@ -0,0 +1,183 @@
#!/usr/bin/env python
"""Large deterministic benchmark for comprehensive pipeline profiling.

Configuration: 100 cells x 10,000 frames
Purpose: Comprehensive profiling and performance baseline establishment

Warning: This benchmark may take several minutes to complete.

Usage:
# Quick runtime check
python benchmarks/profile/profile_pipeline_large.py

# With yappi profiling (wall-clock time)
python benchmarks/profile/profile_pipeline_large.py --profile

# With yappi profiling (CPU time)
python benchmarks/profile/profile_pipeline_large.py --profile --clock cpu

# View results
snakeviz benchmarks/profile/output/profile_pipeline_large.prof
"""

import argparse
import sys
import time
from pathlib import Path

import numpy as np

# Add project root to path for imports
PROJECT_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "src"))

from indeca.core.simulation import ar_trace
from indeca.pipeline import DeconvPipelineConfig, pipeline_bin_new

# Benchmark parameters
NCELL = 100
T = 10000
SEED = 42
TAU_D = 6.0
TAU_R = 1.0
SIGNAL_LEVEL = (1.0, 5.0)
NOISE_STD = 1.0
MAX_ITERS = 20

# Markov transition matrix for spike generation (P[from_state, to_state])
MARKOV_P = np.array([[0.95, 0.05], [0.8, 0.2]])

# Output directory
OUTPUT_DIR = Path(__file__).parent / "output"


def make_test_data(ncell: int, T: int, seed: int) -> np.ndarray:
"""Generate deterministic synthetic calcium imaging data.

Parameters
----------
ncell : int
Number of cells
T : int
Number of time frames
seed : int
Random seed for reproducibility

Returns
-------
Y : np.ndarray
Noisy calcium traces, shape (ncell, T)
"""
rng = np.random.default_rng(seed)

# Generate signal levels for each cell
sig_levels = np.sort(rng.uniform(SIGNAL_LEVEL[0], SIGNAL_LEVEL[1], size=ncell))

# Generate traces
Y = np.zeros((ncell, T))
for i in range(ncell):
C, S = ar_trace(T, MARKOV_P, tau_d=TAU_D, tau_r=TAU_R, rng=rng)
noise = rng.normal(0, NOISE_STD, size=T)
Y[i] = C * sig_levels[i] + noise

return Y


def get_config() -> DeconvPipelineConfig:
"""Get fixed pipeline configuration for benchmarking."""
return DeconvPipelineConfig.from_legacy_kwargs(
up_factor=1,
max_iters=MAX_ITERS,
ar_use_all=True,
est_noise_freq=0.06,
est_use_smooth=True,
est_add_lag=50,
deconv_norm="l2",
deconv_backend="osqp",
)


def run_benchmark(profile: bool = False, clock: str = "wall") -> float:
"""Run the benchmark.

Parameters
----------
profile : bool
Whether to enable yappi profiling
clock : str
Clock type for yappi: "wall" or "cpu"

Returns
-------
elapsed : float
Elapsed time in seconds
"""
# Generate data
print(f"Generating test data: {NCELL} cells x {T} frames (seed={SEED})")
Y = make_test_data(NCELL, T, SEED)
config = get_config()

print(f"Running pipeline (max_iters={MAX_ITERS})...")
print("Note: This may take several minutes...")

if profile:
from indeca.utils.profiling import yappi_profile

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
prof_path = OUTPUT_DIR / "profile_pipeline_large.prof"
print(f"Profiling enabled (clock={clock})")
print(f"Profile output: {prof_path}")

t0 = time.perf_counter()
with yappi_profile(str(prof_path), clock=clock):
C, S, metrics = pipeline_bin_new(
Y, config=config, spawn_dashboard=False, da_client=None
)
elapsed = time.perf_counter() - t0

print(f"\nView profile with: snakeviz {prof_path}")
else:
t0 = time.perf_counter()
C, S, metrics = pipeline_bin_new(
Y, config=config, spawn_dashboard=False, da_client=None
)
elapsed = time.perf_counter() - t0

return elapsed


def main():
parser = argparse.ArgumentParser(
description="Large benchmark for comprehensive pipeline profiling"
)
parser.add_argument(
"--profile",
action="store_true",
help="Enable yappi profiling",
)
parser.add_argument(
"--clock",
choices=["wall", "cpu"],
default="wall",
help="Clock type for profiling (default: wall)",
)
args = parser.parse_args()

print("=" * 60)
print("Pipeline Benchmark: LARGE")
print(f" Cells: {NCELL}")
print(f" Frames: {T}")
print(f" Max iterations: {MAX_ITERS}")
print(" Warning: This may take several minutes")
print("=" * 60)

elapsed = run_benchmark(profile=args.profile, clock=args.clock)

print("=" * 60)
print(f"Total runtime: {elapsed:.3f} seconds")
print("=" * 60)


if __name__ == "__main__":
main()
