dipplestix · dipplestix · Jan 12, 2026 · Jan 12, 2026
diff --git a/marketsim/cuda/README.md b/marketsim/cuda/README.md
@@ -0,0 +1,140 @@
+# CUDA GPU-Accelerated Market Simulator
+
+A fully GPU-accelerated market simulator using CuPy/CUDA for massively parallel market simulations.
+
+## Performance
+
+Benchmarked on NVIDIA RTX 4090:
+
+| Environments | Agents | Steps | Time (s) | Steps/s |
+|-------------|--------|-------|----------|---------|
+| 1,000 | 10 | 1,000 | 5.2 | **190,925** |
+| 5,000 | 10 | 1,000 | 6.6 | **756,032** |
+| 10,000 | 10 | 1,000 | 7.1 | **1,415,362** |
+| 1,000 | 50 | 1,000 | 8.3 | **120,083** |
+| 5,000 | 50 | 1,000 | 9.9 | **503,847** |
+| 1,000 | 100 | 1,000 | 10.2 | **98,152** |
+
+**Peak throughput: 1.4M steps/second** (Steps/s = environments × steps ÷ time) with 10,000 parallel environments.
+
+## Installation
+
+```bash
+pip install cupy-cuda12x scipy
+```
+
+## Quick Start
+
+```python
+from marketsim.cuda import CUDASimulator
+
+# Create simulator with 1000 parallel environments
+sim = CUDASimulator(
+    num_envs=1000,       # Number of parallel simulations
+    num_agents=50,       # Agents per environment
+    sim_time=10000,      # Timesteps per simulation
+    arrival_rate=0.005,  # Agent arrival probability
+    seed=42,
+)
+
+# Run all simulations
+results = sim.run()
+
+# Results contain:
+# - positions: Final positions (num_envs, num_agents)
+# - cash: Final cash (num_envs, num_agents)
+# - total_matches: Match count per environment
+# - final_fundamental: Final fundamental value per environment
+print(f"Mean matches: {results['total_matches'].mean():.1f}")
+
+# Verify conservation laws
+conservation = sim.verify_conservation()
+print(f"Position conservation: {conservation['position_conservation']}")
+```
+
+## Architecture
+
+The GPU simulator consists of:
+
+- **GPUFundamental**: Precomputed mean-reverting fundamental values
+- **GPUPrivateValues**: Vectorized private value generation/lookup
+- **GPUOrderBook**: Sorting-based order book with CDA matching
+- **CUDASimulator**: Main orchestrator running on GPU
+
+All operations are fully vectorized across environments.
+
+## Market Mechanism
+
+This implementation uses a **Continuous Double Auction (CDA)** with price-time priority:
+- Orders are matched based on price priority (best prices first)
+- Multiple matches can occur per timestep
+- Matched orders are cleared immediately
+
+**Note**: The CPU baseline in `marketsim.simulator` uses batch/call market clearing, which is a different mechanism. This leads to different match statistics, but both maintain correct position/cash conservation.
+
+## Configuration
+
+```python
+CUDASimulator(
+    num_envs=1000,        # Parallel environments
+    num_agents=50,        # Agents per environment
+    sim_time=10000,       # Simulation timesteps
+
+    # Agent parameters
+    q_max=10,             # Max position quantity
+    shade=(0, 2),         # Price shade range
+    pv_var=5e6,           # Private value variance
+    eta=1.0,              # Aggressiveness (1.0 = passive)
+
+    # Market parameters
+    mean=1e5,             # Fundamental mean
+    r=0.05,               # Mean reversion rate
+    shock_var=1e6,        # Shock variance
+    arrival_rate=0.005,   # Arrival probability
+
+    # Other
+    seed=42,              # Random seed
+    device=0,             # CUDA device ID
+)
+```
+
+## Multi-GPU Support
+
+```python
+from marketsim.cuda import MultiGPUSimulator
+
+# Distribute across multiple GPUs
+sim = MultiGPUSimulator(
+    num_gpus=4,           # Use 4 GPUs
+    envs_per_gpu=10000,   # 10k envs per GPU = 40k total
+    num_agents=50,
+    sim_time=10000,
+)
+
+results = sim.run_and_aggregate()
+```
+
+## Validation
+
+Conservation laws verified:
+- **Position conservation**: Sum of positions = 0 for all environments ✓
+- **Cash conservation**: Holds up to a minor floating-point deviation (< 1.0) ✓
+
+## GPU Requirements
+
+- CUDA 12.x-compatible GPU
+- CuPy with CUDA 12.x support
+- Recommended: 8GB+ VRAM for large-scale simulations
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `__init__.py` | Package init, GPU detection utilities |
+| `simulator.py` | Main CUDASimulator class |
+| `order_book.py` | GPU order book with sorting-based matching |
+| `fundamental.py` | GPU fundamental value generation |
+| `private_values.py` | GPU private values |
+| `kernels.py` | Vectorized computation kernels |
+| `multi_gpu.py` | Multi-GPU orchestration |
+| `benchmark.py` | Benchmark suite |
diff --git a/marketsim/cuda/__init__.py b/marketsim/cuda/__init__.py
@@ -0,0 +1,131 @@
+"""
+CUDA GPU-accelerated market simulation.
+
+This package provides a fully GPU-accelerated implementation of the market simulator
+using CuPy/CUDA for maximum throughput.
+
+Target performance:
+- >25,000 steps/s (100 msgs, 1 agent) on RTX 4090
+- >400,000 steps/s (1 msg, 1 agent) on RTX 4090
+- Scales to multiple H100s
+"""
+
+import warnings
+
+# Probe CuPy/CUDA once at import time; the defaults below remain in place if the probe fails
+_CUPY_AVAILABLE = False  # becomes True once `import cupy` succeeds
+_CUDA_VERSION = None  # value returned by cp.cuda.runtime.runtimeGetVersion()
+_GPU_NAME = None  # name of device 0, decoded to str when it is reported as bytes
+_GPU_COUNT = 0  # value returned by cp.cuda.runtime.getDeviceCount()
+
+try:
+    import cupy as cp
+    _CUPY_AVAILABLE = True  # set before the runtime queries, so it stays True even if they raise
+    _CUDA_VERSION = cp.cuda.runtime.runtimeGetVersion()
+    _GPU_COUNT = cp.cuda.runtime.getDeviceCount()
+    if _GPU_COUNT > 0:
+        with cp.cuda.Device(0):
+            props = cp.cuda.runtime.getDeviceProperties(0)
+            _GPU_NAME = props['name'].decode() if isinstance(props['name'], bytes) else props['name']
+except ImportError:
+    warnings.warn(
+        "CuPy not available. Install with: pip install cupy-cuda12x\n"
+        "GPU acceleration will not be available."
+    )
+except Exception as e:  # any non-import failure in the probe is downgraded to a warning
+    warnings.warn(f"CUDA initialization failed: {e}")
+
+
+def is_available() -> bool:
+    """Check if CUDA GPU acceleration is available."""
+    return _CUPY_AVAILABLE and _GPU_COUNT > 0
+
+
+def get_device_count() -> int:
+    """Return the CUDA device count recorded at import time (0 if the probe did not get that far)."""
+    return _GPU_COUNT
+
+
+def get_cuda_version() -> int:
+    """Get CUDA runtime version."""
+    return _CUDA_VERSION
+
+
+def get_gpu_name() -> str:
+    """Get name of the primary GPU."""
+    return _GPU_NAME
+
+
+def get_device_info() -> dict:
+    """Get detailed information about available GPUs."""
+    if not _CUPY_AVAILABLE:
+        return {"available": False, "error": "CuPy not installed"}
+
+    if _GPU_COUNT == 0:
+        return {"available": False, "error": "No CUDA devices found"}
+
+    import cupy as cp
+
+    devices = []
+    for i in range(_GPU_COUNT):
+        with cp.cuda.Device(i):
+            props = cp.cuda.runtime.getDeviceProperties(i)
+            name = props['name'].decode() if isinstance(props['name'], bytes) else props['name']
+            devices.append({
+                "id": i,
+                "name": name,
+                "total_memory_gb": props['totalGlobalMem'] / (1024**3),
+                "compute_capability": f"{props['major']}.{props['minor']}",
+                "multiprocessors": props['multiProcessorCount'],
+            })
+
+    return {
+        "available": True,
+        "cuda_version": _CUDA_VERSION,
+        "device_count": _GPU_COUNT,
+        "devices": devices,
+    }
+
+
+def print_device_info():
+    """Print information about available GPUs."""
+    info = get_device_info()
+
+    if not info["available"]:
+        print(f"CUDA not available: {info.get('error', 'Unknown error')}")
+        return
+
+    print(f"CUDA Version: {info['cuda_version']}")
+    print(f"Number of GPUs: {info['device_count']}")
+    print()
+
+    for dev in info["devices"]:
+        print(f"GPU {dev['id']}: {dev['name']}")
+        print(f"  Memory: {dev['total_memory_gb']:.1f} GB")
+        print(f"  Compute Capability: {dev['compute_capability']}")
+        print(f"  Multiprocessors: {dev['multiprocessors']}")
+        print()
+
+
+# Public API exported by star-imports; the classes served lazily by __getattr__ below are not listed
+__all__ = [
+    'is_available',
+    'get_device_count',
+    'get_cuda_version',
+    'get_gpu_name',
+    'get_device_info',
+    'print_device_info',
+]
+
+# Lazy imports for main classes
+def __getattr__(name):
+    if name == 'CUDASimulator':
+        from .simulator import CUDASimulator
+        return CUDASimulator
+    elif name == 'GPUOrderBook':
+        from .order_book import GPUOrderBook
+        return GPUOrderBook
+    elif name == 'MultiGPUSimulator':
+        from .multi_gpu import MultiGPUSimulator
+        return MultiGPUSimulator
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")