Add optional GPU support with CuPy and fallback to CPU computation

RaulSimpetru · RaulSimpetru · commit f2aa0dfa7e4e · 2025-06-30T04:55:16.000+02:00
diff --git a/README.md b/README.md
@@ -18,6 +18,11 @@ https://docs.astral.sh/uv/#highlights
 uv sync
 ```
 
+### Optional: install CuPy for GPU acceleration (requires CUDA 12.x)
+```bash
+uv pip install cupy-cuda12x
+```
+
 ### Activate the environment
 ```bash
 source .venv/bin/activate
diff --git a/myogen/simulator/core/emg/surface_emg.py b/myogen/simulator/core/emg/surface_emg.py
@@ -77,7 +77,13 @@
 import warnings
 from datetime import datetime
 
-import cupy
+try:
+    import cupy as cp
+
+    HAS_CUPY = True
+except ImportError:
+    HAS_CUPY = False
+
 import joblib
 import matplotlib.patches as patches
 import matplotlib.pyplot as plt
@@ -1037,8 +1043,9 @@ def simulate_surface_emg(
         convolution of spike trains with MUAP shapes, and summation across all active motor units
         for each electrode channel.
         
-        The simulation uses GPU acceleration (via CuPy) for efficient parallel processing of
+        The simulation can optionally use GPU acceleration (via CuPy) for efficient parallel processing of
         convolution operations across multiple electrode positions, motor units, and time points.
+        If CuPy is not available, the simulation automatically falls back to CPU computation with NumPy.
         
         Scientific Background
         --------------------
@@ -1067,11 +1074,13 @@ def simulate_surface_emg(
         
         GPU Acceleration
         ---------------
-        The method automatically uses GPU processing for:
+        The method automatically uses GPU processing (if CuPy is available) for:
         - Large-scale convolution operations across multiple dimensions
         - Parallel processing of electrode positions and motor unit combinations
         - Efficient memory management for large EMG datasets
-        - Automatic fallback to CPU if GPU unavailable
+        
+        If CuPy cannot be imported, the method automatically falls back
+        to CPU-based computation using NumPy with equivalent functionality.
         
         Parameters
         ----------
@@ -1113,14 +1122,15 @@ def simulate_surface_emg(
             between simulated motor units and available spike trains.
             
         RuntimeError
-            If GPU memory is insufficient for large simulations (automatically falls back to CPU).
+            If GPU memory is insufficient for large simulations when using CuPy
+            (the CPU fallback applies only when CuPy cannot be imported).
         
         Notes
         -----
         **Computational Complexity:**
         - Time complexity: O(n_positions × n_pools × n_MUs × n_electrodes × n_time_samples)
         - Space complexity: O(n_positions × n_pools × n_electrodes × n_time_samples)
-        - GPU acceleration provides 10-50x speedup for large simulations
+        - GPU acceleration (if available) provides 10-50x speedup for large simulations
         
         **Memory Requirements:**
         - Input MUAPs: n_positions × n_MUs × n_electrodes × n_time_samples × 8 bytes
@@ -1239,41 +1249,76 @@ def simulate_surface_emg(
             mode="same",
         )
 
-        # Perform the convolution and summation using GPU acceleration
-        spike_gpu = cupy.asarray(motor_neuron_pool.spike_trains)
-        muap_gpu = cupy.asarray(muap_shapes__tensor)
-        surface_emg_gpu = cupy.zeros(
-            (n_positions, n_pools, n_rows, n_cols, len(sample_conv))
-        )
+        # Perform the convolution and summation using GPU acceleration if available
+        if HAS_CUPY:
+            # Use GPU acceleration with CuPy
+            spike_gpu = cp.asarray(motor_neuron_pool.spike_trains)
+            muap_gpu = cp.asarray(muap_shapes__tensor)
+            surface_emg_gpu = cp.zeros(
+                (n_positions, n_pools, n_rows, n_cols, len(sample_conv))
+            )
 
-        for mu_pool_idx in tqdm(range(n_pools), desc="Simulating surface EMG"):
-            active_neuron_indices = set(
-                motor_neuron_pool.active_neuron_indices[mu_pool_idx]
+            for mu_pool_idx in tqdm(range(n_pools), desc="Simulating surface EMG (GPU)"):
+                active_neuron_indices = set(
+                    motor_neuron_pool.active_neuron_indices[mu_pool_idx]
+                )
+
+                for position_idx in range(n_positions):
+                    for row_idx in range(n_rows):
+                        for col_idx in range(n_cols):
+                            # Process all MUAPs for this combination on GPU
+                            convolutions = cp.array(
+                                [
+                                    cp.correlate(
+                                        spike_gpu[mu_pool_idx, muap_idx],
+                                        muap_gpu[position_idx, muap_idx, row_idx, col_idx],
+                                        mode="same",
+                                    )
+                                    for muap_idx in MUs_to_simulate.intersection(
+                                        active_neuron_indices
+                                    )
+                                ]
+                            )
+                            # Sum across MUAPs on GPU
+                            surface_emg_gpu[position_idx, mu_pool_idx, row_idx, col_idx] = (
+                                cp.sum(convolutions, axis=0)
+                            )
+
+            # Transfer results back to CPU
+            self.surface_emg__tensor = cp.asnumpy(surface_emg_gpu)
+        else:
+            # Fallback to CPU computation with NumPy
+            surface_emg_cpu = np.zeros(
+                (n_positions, n_pools, n_rows, n_cols, len(sample_conv))
             )
 
-            for position_idx in range(n_positions):
-                for row_idx in range(n_rows):
-                    for col_idx in range(n_cols):
-                        # Process all MUAPs for this combination on GPU
-                        convolutions = cupy.array(
-                            [
-                                cupy.correlate(
-                                    spike_gpu[mu_pool_idx, muap_idx],
-                                    muap_gpu[position_idx, muap_idx, row_idx, col_idx],
-                                    mode="same",
-                                )
-                                for muap_idx in MUs_to_simulate.intersection(
-                                    active_neuron_indices
-                                )
-                            ]
-                        )
-                        # Sum across MUAPs on GPU
-                        surface_emg_gpu[position_idx, mu_pool_idx, row_idx, col_idx] = (
-                            cupy.sum(convolutions, axis=0)
-                        )
+            for mu_pool_idx in tqdm(range(n_pools), desc="Simulating surface EMG (CPU)"):
+                active_neuron_indices = set(
+                    motor_neuron_pool.active_neuron_indices[mu_pool_idx]
+                )
 
-        # Transfer results back to CPU
-        self.surface_emg__tensor = cupy.asnumpy(surface_emg_gpu)
+                for position_idx in range(n_positions):
+                    for row_idx in range(n_rows):
+                        for col_idx in range(n_cols):
+                            # Process all MUAPs for this combination on CPU
+                            convolutions = np.array(
+                                [
+                                    np.correlate(
+                                        motor_neuron_pool.spike_trains[mu_pool_idx, muap_idx],
+                                        muap_shapes__tensor[position_idx, muap_idx, row_idx, col_idx],
+                                        mode="same",
+                                    )
+                                    for muap_idx in MUs_to_simulate.intersection(
+                                        active_neuron_indices
+                                    )
+                                ]
+                            )
+                            # Sum across MUAPs on CPU
+                            surface_emg_cpu[position_idx, mu_pool_idx, row_idx, col_idx] = (
+                                np.sum(convolutions, axis=0)
+                            )
+
+            self.surface_emg__tensor = surface_emg_cpu
 
         return self.surface_emg__tensor
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,6 @@ readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
     "beartype>=0.21.0",
-    "cupy-cuda12x>=13.4.1",
     "impi-rt>=2021.15.0 ; sys_platform == 'win32'",
     "matplotlib>=3.10.1",
     "mpi4py>=4.0.3",
@@ -44,4 +43,4 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools]
-packages = ["myogen"]
+packages = ["myogen"]
diff --git a/uv.lock b/uv.lock