diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..0b9caed Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 505a3b1..259d473 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,33 @@ dist/ wheels/ *.egg-info +# mac specific crap +.DS_Store + +# checkpoints +checkpoints_*/ + # Virtual environments .venv + +# wandb files +wandb/ + +# Slurm logs +logs/ +*.log + +# Big jsonl files +data/ +*.jsonl + +# Environment files (secrets) +.env +.env.* +.DS_Store + +# HPC specific files +examples/code_exec/hpc/ + +# personal research directory +research/ diff --git a/.python-version b/.python-version deleted file mode 100644 index e4fba21..0000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.12 diff --git a/AGENTS.md b/CLAUDE.md similarity index 100% rename from AGENTS.md rename to CLAUDE.md diff --git a/core b/core new file mode 100644 index 0000000..372339e Binary files /dev/null and b/core differ diff --git a/examples/code_exec/README.md b/examples/code_exec/README.md new file mode 100644 index 0000000..caa6f4e --- /dev/null +++ b/examples/code_exec/README.md @@ -0,0 +1,526 @@ +# Code Execution Training + +> Train LLMs on code generation with sandboxed test-driven evaluation. 
+ +## What This Is + +This module provides an RL training environment where: +- The **agent generates code** in response to programming problems +- The **environment executes the code** in isolated containers +- **Test cases verify correctness** and provide reward signal +- The **trainer updates the policy** based on execution outcomes + +Key features: +- **Sandboxed execution** — Generated code runs in Docker/Podman containers for security +- **Persistent containers** — 40x faster than cold-start containers (17ms vs 700ms per execution) +- **Automatic caching** — Skip redundant executions (especially valuable with CISPO/GRPO) +- **Multi-backend** — Works on laptop (Docker) or HPC clusters (Podman-HPC) + +## Who This Is For + +**Experienced Ludic users**: Jump to [Quick Start](#quick-start) for copy-paste examples. + +**New to Ludic**: Read [How It Works](#how-it-works) first to understand the concepts. + +**Prerequisites**: +- Familiarity with Ludic's training concepts (`Trainer`, `RolloutEngine`, `BatchSource`) +- Docker running locally, or Podman-HPC on your HPC cluster +- A vLLM inference server for generation + +--- + +## Quick Start + +### Prerequisites + +1. **Docker daemon running** — See [Setup Guide](#setup-guide) if not +2. **HuggingFace token** — Create `.env` file: `echo 'HF_TOKEN=your_token' > .env` +3. **Dependencies**: `pip install docker datasets peft` + +### 5-Minute Local Run + +```bash +# Terminal 1: Start vLLM inference server +CUDA_VISIBLE_DEVICES=0 uv run --env-file .env python -m ludic.inference.vllm_server \ + --model Qwen/Qwen2.5-Coder-0.5B-Instruct + +# Terminal 2: Run training +CUDA_VISIBLE_DEVICES=1 PYTHONPATH=. uv run --env-file .env python examples/code_exec/train_apps.py \ + --model Qwen/Qwen2.5-Coder-0.5B-Instruct \ + --limit 100 \ + --train-steps 10 +``` + +You should see: +- Sandbox pool starting with 4 workers +- Baseline evaluation running +- Training steps with reward metrics + +### HPC Cluster Run (Slurm) + +```bash +# 1. 
Prepare environment on LOGIN NODE (one-time, requires internet) +./examples/code_exec/prepare_env.sh + +# 2. Submit job to compute nodes +sbatch examples/code_exec/train_apps_isambard.slurm +``` + +The Slurm script handles: +- Starting vLLM server on GPU 0 +- Running training on GPU 1 +- Auto-detecting Podman-HPC backend +- Structured logging in `logs/YYYY-MM-DD/` + +--- + +## How It Works + +### The Training Loop + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Training Loop │ +│ ┌─────────────┐ ┌──────────────┐ ┌─────────────────┐ │ +│ │ Trainer │◄──│ BatchSource │◄──│ RolloutEngine │ │ +│ └─────────────┘ └──────────────┘ └────────┬────────┘ │ +│ ▲ │ │ +│ │ ┌───────────▼─────────┐ │ +│ Weight Updates │ SingleAgentProtocol │ │ +│ │ └────────────┬────────┘ │ +│ ▼ │ │ +│ ┌───────────┐ ┌────────────▼────────┐ │ +│ │ vLLM │◄────────────────────│ CodeExecEnv │ │ +│ │ Server │ generates code └────────────┬────────┘ │ +│ └───────────┘ │ │ +└─────────────────────────────────────────────────┼───────────┘ + │ executes + ┌────────────▼────────┐ + │ SandboxPool │ + │ ┌────┐ ┌────┐ │ + │ │ S1 │ │ S2 │ ... │ + │ └────┘ └────┘ │ + └─────────────────────┘ +``` + +**Step by step:** + +1. **RolloutEngine** creates a `CodeExecEnv` for each problem from the dataset +2. **Agent** (via vLLM) generates Python code given the problem prompt +3. **CodeExecEnv** sends the code to a sandboxed container for execution +4. **Test cases** are run against the code; results determine the reward +5. **Trainer** collects rollouts and updates the model weights +6. **Weights are pushed** back to vLLM for the next generation round + +### Key Concepts + +#### Sandboxing + +**Why sandbox?** LLM-generated code can be malicious, buggy, or resource-hungry. 
Sandboxing: +- Prevents file system access outside the container +- Limits memory and CPU usage +- Disables network access (by default) +- Isolates each execution from others + +**Persistent containers** are the key to performance. Instead of starting a new container per execution (700ms overhead), we keep containers running and reuse them (17ms overhead). + +#### Backend Auto-Detection + +| Environment | Priority | +|-------------|----------| +| Inside Slurm job | `podman-hpc` → `docker` | +| Outside Slurm | `docker` → `podman-hpc` | + +Override with `--sandbox-backend docker` or `--sandbox-backend podman-hpc`. + +#### Test-Driven Evaluation + +Each problem in the dataset has test cases (input/output pairs). The flow: + +1. **TestAdapter** extracts test cases from the dataset format (e.g., `APPSTestAdapter` for APPS) +2. **StdinStdoutRunner** executes the code with each test's input as stdin +3. **OutputVerifier** compares actual output to expected output +4. **Reward** is computed based on test pass rate + +#### Caching + +The LRU cache prevents redundant execution: + +- **Cache key**: `hash(code) + hash(tests)` +- **Hit rate**: Often 30-50% with CISPO/GRPO (multiple generations per prompt) +- **Speedup**: Cache hits return instantly (no container execution) + +Monitor cache performance: +```python +stats = pool.cache_stats +# {'hits': 150, 'misses': 50, 'size': 200} +hit_rate = stats['hits'] / (stats['hits'] + stats['misses']) +``` + +### Reward Shaping + +| Event | Reward | Configurable | Rationale | +|-------|--------|--------------|-----------| +| All tests pass | `+1.0` | — | Complete success | +| Some tests pass | `0.0` to `1.0` | `--partial-credit` | Smoother gradient signal | +| All tests fail | `0.0` | — | No partial credit by default | +| Compile error | `-0.1` | `compile_failure_reward` | Discourage syntax errors | +| Proper code block | `+0.05` | Parser reward | Encourage correct formatting | + +**When to enable partial credit:** +- Training from 
scratch (model needs incremental signal) +- Long test suites where all-or-nothing is too sparse + +**When to keep binary rewards:** +- Fine-tuning a capable model +- Problems where partial correctness is meaningless + +--- + +## Configuration Reference + +### Training Script Arguments (`train_apps.py`) + +#### Model & Inference + +| Flag | Default | Description | +|------|---------|-------------| +| `--model` | `Qwen/Qwen2.5-3B-Instruct` | Model name or path | +| `--host` | `127.0.0.1` | vLLM server host | +| `--port` | `8000` | vLLM server port | +| `--max-prompt-tokens` | `1024` | Max prompt length (longer prompts filtered) | +| `--max-new-tokens` | `4096` | Max generation length | + +#### Training + +| Flag | Default | Description | +|------|---------|-------------| +| `--train-steps` | `100` | Number of training steps | +| `--batch-size` | `4` | Rollout requests per batch | +| `--group-size` | `8` | CISPO group size (rollouts per prompt) | +| `--train-temperature` | `0.8` | Sampling temperature | +| `--max-seq-len` | `2048` | Max tokens per sample (truncation limit) | +| `--micro-token-budget` | `16384` | Max padded tokens per micro-batch | + +#### LoRA + +| Flag | Default | Description | +|------|---------|-------------| +| `--lora-rank` | `8` | LoRA rank | +| `--lora-alpha` | `32` | LoRA alpha | +| `--lora-dropout` | `0.0` | LoRA dropout | +| `--lora-use-rslora` | `False` | Use rank-stabilized LoRA | + +#### Dataset + +| Flag | Default | Description | +|------|---------|-------------| +| `--split` | `train` | Dataset split | +| `--limit` | None | Max samples to load | +| `--difficulty` | None | Filter: `introductory`, `interview`, `competition` | +| `--eval-samples` | `200` | Hold out for evaluation | + +#### Sandbox + +| Flag | Default | Description | +|------|---------|-------------| +| `--sandbox-backend` | `auto` | `auto`, `docker`, `podman-hpc` | +| `--sandbox-workers` | `4` | Container pool size | +| `--python-version` | `3.11` | Python in sandbox | +| `--timeout-per-test` | `2.0` | Per-test timeout
(seconds) | +| `--partial-credit` | `False` | Enable fractional rewards | +| `--minimal-sandbox` | `False` | Skip memory/network limits (HPC compat) | +| `--max-concurrent-ops` | `8` | Semaphore limit for Podman | + +#### Evaluation + +| Flag | Default | Description | +|------|---------|-------------| +| `--eval-every` | `25` | Eval every N steps | +| `--eval-before-start` | `True` | Run baseline evaluation | +| `--eval-concurrency` | `32` | Parallel eval rollouts | +| `--eval-temperature` | `0.5` | Eval sampling temperature | + +#### Logging + +| Flag | Default | Description | +|------|---------|-------------| +| `--wandb` | `False` | Enable W&B logging | +| `--wandb-project` | `ludic-apps` | W&B project name | + +### Environment Configuration (`CodeExecConfig`) + +| Field | Default | Description | +|-------|---------|-------------| +| `timeout_per_test_s` | `5.0` | Per-test execution timeout | +| `memory_limit_mb` | `256` | Container memory limit | +| `max_tests` | `None` | Limit test count (None = all) | +| `stop_on_first_failure` | `True` | Early stop on failure | +| `compile_first` | `True` | Syntax check before running | +| `partial_credit` | `False` | Reward = pass_rate (vs binary) | +| `compile_failure_reward` | `-0.1` | Penalty for syntax errors | +| `use_cache` | `True` | Enable execution caching | + +### Sandbox Pool Sizing + +| Environment | CPUs | Recommended `--sandbox-workers` | +|-------------|------|--------------------------------| +| Laptop (M1/M2) | 8-10 | 4 | +| Workstation | 16-32 | 8-16 | +| HPC node | 64-128 | 24-64 | + +**Rule of thumb**: Each sandbox uses ~0.5-1 CPU core. Use `floor(cpus / 2)`.
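The rule of thumb can be expressed as a small helper (a sketch only; the function name and the 64-worker cap are illustrative, not part of the module):

```python
def recommended_sandbox_workers(cpus: int, cap: int = 64) -> int:
    """Suggest a --sandbox-workers value: floor(cpus / 2), clamped to [1, cap]."""
    return max(1, min(cap, cpus // 2))

# A 10-core laptop gets 5 workers; a 128-core HPC node is capped at 64.
```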
+ +**Concurrency vs Workers**: +- `--concurrency` controls parallel rollouts (async tasks) +- `--sandbox-workers` controls parallel code executions +- If `concurrency > sandbox-workers`, tasks queue for sandboxes + +--- + +## End-to-End Example + +This complete example shows how to build a training script from scratch: + +```python +"""Minimal code execution training script.""" + +import asyncio +from datasets import load_dataset + +from ludic.agent import Agent +from ludic.context import FullDialog +from ludic.inference import VLLMChatClient, InferenceSpec, SamplingParams, ReturnSpec +from ludic.interaction import SingleAgentProtocol +from ludic.parsers import ParseResult +from ludic.distributed.adapters import create_vllm_publisher +from ludic.training import ( + RolloutEngine, RolloutBatchSource, Trainer, TrainerConfig, + make_cispo, make_dataset_queue_requests_fn, +) +from ludic.envs.code_exec import ( + CodeExecEnv, CodeExecConfig, create_sandbox_pool, APPSTestAdapter, +) + +async def main(): + # 1. Load dataset + ds = load_dataset("RoganInglis/apps-control-arena", split="train") + samples = [{"question": r["question"], "inputs": r["inputs"], "outputs": r["outputs"]} + for r in list(ds)[:100]] + + # 2. Create sandbox pool (shared across all envs) + pool = await create_sandbox_pool(n_workers=4, backend="auto") + + # 3. Setup inference client + client = VLLMChatClient(host="127.0.0.1", port=8000, enable_weight_updates=True) + publisher = create_vllm_publisher(client) + + # 4. Environment factory (captures pool via closure) + adapter = APPSTestAdapter() + env_config = CodeExecConfig(timeout_per_test_s=5.0, partial_credit=False) + + def env_factory(sample): + return CodeExecEnv(sample=sample, sandbox_pool=pool, + test_adapter=adapter, config=env_config) + + # 5. 
Protocol factory + def protocol_factory(): + return SingleAgentProtocol(agent=Agent( + client=client, model="Qwen/Qwen2.5-3B-Instruct", + ctx=FullDialog(), + parser=lambda raw: ParseResult(action=raw, reward=0.0, obs=None), + )) + + # 6. Setup training pipeline + engine = RolloutEngine( + env_registry={"apps": env_factory}, + protocol_registry={"single": protocol_factory}, + ) + + algo = make_cispo(group_size=8, clip_eps_high=5.0, length_normalize=True) + + batch_source = RolloutBatchSource( + orchestrator=engine, + credit_assigner=algo.credit_assigner, + requests_fn=make_dataset_queue_requests_fn(...), # See train_apps.py + concurrency=32, + ) + + # 7. Train + trainer = Trainer( + model=your_model, # Load with LoRA + algo=algo, + batch_source=batch_source, + publisher=publisher, + cfg=TrainerConfig(max_seq_len=2048, micro_token_budget=16384), + ) + + await trainer.train(num_steps=100) + + # 8. Cleanup + await pool.shutdown() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +For a complete, production-ready script, see [`train_apps.py`](./train_apps.py). 
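The reward shaping rules from the table in [How It Works](#how-it-works) can be sketched as a standalone function (a hypothetical helper for illustration; the real logic lives inside `CodeExecEnv` and `CodeExecConfig`):

```python
def execution_reward(
    passed: int,
    total: int,
    compiled: bool,
    partial_credit: bool = False,
    compile_failure_reward: float = -0.1,
) -> float:
    """Map test results to a scalar reward, mirroring the reward table."""
    if not compiled:
        return compile_failure_reward  # syntax error: small penalty
    if total == 0:
        return 0.0
    if partial_credit:
        return passed / total  # smooth signal: fraction of tests passed
    return 1.0 if passed == total else 0.0  # binary: all-or-nothing
```

Note that the `+0.05` formatting bonus comes from the parser, not from test execution, so it is not part of this function.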
+ +--- + +## Customization + +### Using a Different Dataset + +Implement the `TestAdapter` protocol: + +```python +from ludic.envs.code_exec import TestAdapter, TestCase + +class MyDatasetAdapter: + def get_tests(self, sample: dict) -> list[TestCase]: + return [ + TestCase(input=t["stdin"], expected=t["stdout"], id=f"test_{i}") + for i, t in enumerate(sample["tests"]) + ] + + def get_prompt(self, sample: dict) -> str: + return sample["problem_description"] + + def get_problem_id(self, sample: dict) -> str: + return sample["id"] +``` + +### Custom Reward Shaping + +Modify `CodeExecConfig`: + +```python +config = CodeExecConfig( + partial_credit=True, # Reward = fraction of tests passed + compile_failure_reward=-0.5, # Harsher penalty for syntax errors + stop_on_first_failure=False, # Run all tests for full feedback +) +``` + +### Custom Output Verification + +For floating-point comparisons: + +```python +from ludic.envs.code_exec.adapters import FloatTolerantVerifier + +verifier = FloatTolerantVerifier(abs_tol=1e-6, rel_tol=1e-6) +runner = StdinStdoutRunner(verifier=verifier) +``` + +For full API details, see the [Module README](../../src/ludic/envs/code_exec/README.md). + +--- + +## Troubleshooting + +### "Docker daemon not running" + +``` +docker.errors.DockerException: Error while fetching server API version +``` + +**Solution**: Start Docker Desktop (macOS/Windows) or `sudo systemctl start docker` (Linux). + +### Tests timing out + +**Symptoms**: Many `TIMEOUT` results, slow training. + +**Diagnosis**: Check if problems have expensive test cases. + +**Solutions**: +- Increase timeout: `--timeout-per-test 10.0` +- Use batch execution (enabled by default) +- Reduce number of tests: Set `max_tests` in `CodeExecConfig` + +### GPU out of memory + +**Solutions**: +- Reduce `--batch-size` +- Reduce `--micro-token-budget` +- Enable gradient checkpointing (already on by default) + +### Slow sandbox initialization + +**Symptoms**: "Starting sandbox pool..." 
takes 30+ seconds. + +**Solutions**: +- Reduce `--sandbox-workers` for initial testing +- Pre-pull images: `docker pull python:3.11-slim` + +### Podman-HPC: Image not found on compute node + +**Cause**: Images must be migrated to shared storage. + +**Solution**: +```bash +podman-hpc pull python:3.11-slim  # Auto-migrates +podman-hpc images                 # Verify R/O=true +``` + +### Network access denied on compute node + +**Cause**: HPC compute nodes often lack internet access. + +**Solution**: Run `prepare_env.sh` on the login node first to pre-stage all dependencies. + +--- + +## Setup Guide + +### Docker (Local Development) + +```bash +# Install (macOS) +brew install --cask docker + +# Install (Linux) +curl -fsSL https://get.docker.com | sh +sudo usermod -aG docker $USER && newgrp docker + +# Start daemon +open -a Docker               # macOS +sudo systemctl start docker  # Linux + +# Verify (quote the requirement so the shell doesn't treat '>' as a redirect) +docker info && pip install "docker>=7.0.0" +``` + +For detailed setup, see [Docker documentation](https://docs.docker.com/get-docker/). + +### Podman-HPC (HPC Clusters) + +```bash +# Pull and migrate image to shared storage +podman-hpc pull python:3.11-slim + +# Verify migration (R/O should be 'true') +podman-hpc images + +# Test execution +srun -N 1 podman-hpc run --rm python:3.11-slim python -c "print('hello')" +``` + +For cluster-specific setup, consult your HPC documentation or [Podman-HPC docs](https://github.com/NERSC/podman-hpc).
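The backend auto-detection order from [How It Works](#how-it-works) can be sketched roughly as follows (a simplified stand-in for the real detection logic; checking the CLI name with `shutil.which` is an assumption):

```python
import os
import shutil

def backend_preference(inside_slurm: bool) -> list[str]:
    """Backend priority from the auto-detection table."""
    return ["podman-hpc", "docker"] if inside_slurm else ["docker", "podman-hpc"]

def detect_backend() -> str:
    """Return the first backend whose CLI is on PATH, in priority order."""
    order = backend_preference("SLURM_JOB_ID" in os.environ)
    for name in order:
        if shutil.which(name):  # is this runtime's CLI installed?
            return name
    raise RuntimeError(f"No container runtime found (tried: {', '.join(order)})")
```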
+ +### Verifying Your Setup + +```bash +# Run integration tests +pytest tests/integration/test_code_exec_docker.py -v + +# If tests are skipped, Docker is not accessible +``` + +--- + +## See Also + +- **Module README**: [src/ludic/envs/code_exec/README.md](../../src/ludic/envs/code_exec/README.md) — API reference, protocols, internals +- **Migration Guide**: [MIGRATION.md](./MIGRATION.md) — Training API changes and migration steps +- **Training Script**: [train_apps.py](./train_apps.py) — Production-ready example diff --git a/examples/code_exec/train_apps.py b/examples/code_exec/train_apps.py new file mode 100644 index 0000000..bb7496a --- /dev/null +++ b/examples/code_exec/train_apps.py @@ -0,0 +1,771 @@ +""" +APPS code generation training scaffold using CodeExecEnv with LoRA. + +This wires together: + - HuggingFace datasets for APPS code samples + - CodeExecEnv with sandboxed execution (Docker or Podman-HPC) + - SingleAgentProtocol with async env support + - LoRA adapters via PEFT for efficient fine-tuning + - GRPO with optional KL regularization + - Baseline + periodic evaluation on held-out samples + - RichLiveLogger (terminal dashboard) or WandB (cloud logging) + +Requirements: + - Container runtime: Docker daemon OR Podman-HPC (auto-detected) + - pip install docker>=7.0.0 datasets peft (for Docker backend) + - GPU(s) for training (optional for rollout-only mode) + +Usage: + # Start vLLM server (in one terminal) + CUDA_VISIBLE_DEVICES=0 uv run python -m ludic.inference.vllm_server \\ + --model Qwen/Qwen2.5-3B-Instruct + + # Run training with terminal dashboard (default) + CUDA_VISIBLE_DEVICES=1 PYTHONPATH=. uv run python examples/code_exec/train_apps.py \\ + --model Qwen/Qwen2.5-3B-Instruct \\ + --limit 500 --eval-samples 200 --train-steps 100 --final-save + + # Run training with KL regularization + CUDA_VISIBLE_DEVICES=1 PYTHONPATH=. 
uv run python examples/code_exec/train_apps.py \\ + --model Qwen/Qwen2.5-3B-Instruct \\ + --limit 500 --eval-samples 200 --train-steps 100 \\ + --kl-coeff 0.01 --final-save + + # Run training with WandB logging + CUDA_VISIBLE_DEVICES=1 PYTHONPATH=. uv run python examples/code_exec/train_apps.py \\ + --model Qwen/Qwen2.5-3B-Instruct \\ + --limit 500 --eval-samples 200 --train-steps 100 \\ + --wandb --wandb-project ludic-apps --final-save + +Key Features: + - LoRA: rank=8, alpha=32, attention + MLP projection target modules (configurable) + - Eval: Baseline before training, periodic eval every N steps + - Logging: Terminal sparkline dashboard or WandB cloud tracking + - KL regularization: Optional penalty to prevent policy drift + +See README.md for detailed setup instructions. +""" + +from __future__ import annotations + +import argparse +import asyncio +import os +import queue +from typing import Any, Dict, List + +import torch +from datasets import load_dataset  # type: ignore +from transformers import AutoModelForCausalLM, AutoTokenizer +from peft import get_peft_model, LoraConfig, TaskType + +from ludic.agent import Agent +from ludic.context import FullDialog +from ludic.inference import VLLMChatClient, InferenceSpec, SamplingParams, ReturnSpec +from ludic.interaction import SingleAgentProtocol +from ludic.parsers import ParseResult +from ludic.distributed.adapters import create_vllm_publisher +from ludic.eval import EngineEvaluator +from ludic.training import ( + RolloutEngine, + RolloutBatchSource, + Trainer, + TrainerConfig, + CheckpointConfig, + make_dataset_queue_requests_fn, + RequestsExhausted, + RolloutRequest, + EnvSpec, + ProtocolSpec, + # Algorithm + make_cispo, +) +from ludic.training import Reducer, RichLiveLogger, default_reducers +from ludic.training.loggers import WandbLogger +from ludic.training.hardware import configure_flash_attention, log_hardware_info + +# Import CodeExecEnv components +from ludic.envs.code_exec import ( + CodeExecEnv, + CodeExecConfig, +
create_sandbox_pool, + SandboxBackend, +) +from ludic.envs.code_exec.adapters.apps import APPSTestAdapter, APPS_SYSTEM_PROMPT + +import logging + +logging.basicConfig( + level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s" +) + + +def code_block_parser(raw: str) -> ParseResult: + """ + Parse code from markdown code blocks or raw text. + + Accepts: + - ```python\n...\n``` + - ```\n...\n``` + - Raw code (if no code blocks found) + + Returns parsed code with small format reward for proper code blocks. + """ + import re + + # Try to extract from markdown code block + code_block_pattern = r"```(?:python)?\s*\n(.*?)(?:\n)?```" + match = re.search(code_block_pattern, raw, re.DOTALL) + + if match: + code = match.group(1).strip() + return ParseResult( + action=code, reward=0.05, obs=None + ) # Small bonus for proper formatting + + # Empty response if no code block found + return ParseResult(action=None, reward=-0.1, obs="Please provide Python code.") + + +def load_apps_samples( + split: str = "train", + limit: int | None = None, + difficulty: str | None = None, +) -> List[Dict[str, Any]]: + """ + Load APPS samples from HuggingFace datasets. + + Args: + split: Dataset split ("train" or "test") + limit: Maximum number of samples to load + difficulty: Filter by difficulty ("introductory", "interview", "competition") + + Returns: + List of sample dicts with question, inputs, outputs, etc. 
+ """ + # Load from the control-arena version which has cleaner formatting + ds = load_dataset("RoganInglis/apps-control-arena", split=split) + + samples: List[Dict[str, Any]] = [] + for idx, row in enumerate(ds): + # Filter by difficulty if specified + if difficulty and row.get("difficulty") != difficulty: + continue + + # Skip nondeterministic problems (they require special handling) + if row.get("is_nondeterministic", False): + continue + + samples.append( + { + "problem_id": row.get("problem_id", str(idx)), + "question": row["question"], + "inputs": row.get("inputs", []), + "outputs": row.get("outputs", []), + "difficulty": row.get("difficulty", "unknown"), + } + ) + + if limit is not None and len(samples) >= limit: + break + + return samples + + +def main(): + parser = argparse.ArgumentParser( + description="Train on APPS code generation dataset with LoRA" + ) + + # Model and inference + parser.add_argument("--model", default="Qwen/Qwen2.5-3B-Instruct") + parser.add_argument("--host", default="127.0.0.1", help="vLLM server host") + parser.add_argument("--port", type=int, default=8000, help="vLLM server port") + parser.add_argument( + "--max-prompt-tokens", type=int, default=1024, help="Max prompt tokens" + ) + parser.add_argument( + "--max-new-tokens", type=int, default=4096, help="Max new tokens" + ) + parser.add_argument( + "--stop", + nargs="*", + default=None, + help="Stop sequences (e.g. 
--stop '```' '')", + ) + + # LoRA configuration + parser.add_argument("--lora-rank", type=int, default=8, help="LoRA rank") + parser.add_argument( + "--lora-alpha", type=int, default=32, help="LoRA alpha" + ) + parser.add_argument("--lora-dropout", type=float, default=0.0, help="LoRA dropout") + parser.add_argument("--lora-use-rslora", action="store_true", help="Use RSLora") + + # Attention configuration + parser.add_argument( + "--disable-flash-attn", + action="store_true", + help="Disable Flash Attention (fall back to SDPA)", + ) + + # KL regularization + parser.add_argument( + "--kl-coeff", + type=float, + default=0.0, + help="KL penalty coefficient (0 = disabled)", + ) + + # Data + parser.add_argument("--split", default="train", help="Dataset split") + parser.add_argument("--limit", type=int, default=None, help="Max samples to load") + parser.add_argument("--difficulty", default=None, help="Filter by difficulty") + + # Sandbox + parser.add_argument( + "--max-concurrent-ops", + type=int, + default=8, + help="Max concurrent sandbox operations (prevents deadlock in HPC environments)", + ) + parser.add_argument( + "--sandbox-workers", type=int, default=4, help="Number of sandbox containers" + ) + parser.add_argument( + "--sandbox-backend", + default="auto", + choices=["auto", "docker", "podman-hpc"], + help="Sandbox backend (default: auto-detect)", + ) + parser.add_argument( + "--python-version", default="3.11", help="Python version in sandbox" + ) + parser.add_argument( + "--minimal-sandbox", + action="store_true", + help="Use minimal sandbox config (no memory/network limits) for HPC compatibility", + ) + parser.add_argument( + "--timeout-per-test", type=float, default=2.0, help="Timeout per test (seconds)" + ) + + # Training + parser.add_argument("--lr", type=float, default=1e-5, help="Learning rate") + parser.add_argument( + "--concurrency", type=int, default=32, help="Rollout concurrency" + ) + parser.add_argument( + "--batch-size", type=int, default=4, 
help="Rollout requests per batch" + ) + parser.add_argument( + "--train-steps", + type=int, + default=100, + help="Training steps (0=run until exhausted)", + ) + parser.add_argument("--group-size", type=int, default=8, help="GRPO group size") + parser.add_argument( + "--train-temperature", type=float, default=0.8, help="Sampling temperature" + ) + parser.add_argument( + "--partial-credit", action="store_true", help="Enable partial credit rewards" + ) + parser.add_argument( + "--max-seq-len", + type=int, + default=2048, + help="Max tokens per sample (sequences are truncated to this)", + ) + parser.add_argument( + "--micro-token-budget", + type=int, + default=16384, + help="Max padded tokens per micro-batch (replaces grad_accum_steps)", + ) + + # Evaluation + parser.add_argument( + "--eval-samples", + type=int, + default=200, + help="Number of samples to hold out for eval", + ) + parser.add_argument( + "--eval-every", type=int, default=25, help="Eval every N training steps" + ) + parser.add_argument( + "--eval-before-start", + action="store_true", + default=True, + help="Run baseline eval", + ) + parser.add_argument( + "--eval-concurrency", type=int, default=32, help="Eval concurrency" + ) + parser.add_argument( + "--eval-temperature", + type=float, + default=0.5, + help="Eval sampling temperature", + ) + + # Logging + parser.add_argument( + "--wandb", action="store_true", help="Enable Weights & Biases logging" + ) + parser.add_argument( + "--wandb-project", type=str, default="ludic-apps", help="WandB project name" + ) + + # Checkpoints + parser.add_argument("--rollout-log", default="data/apps_train_rollouts.jsonl") + parser.add_argument("--checkpoint-dir", default="checkpoints_apps") + parser.add_argument("--checkpoint-every", type=int, default=25) + parser.add_argument( + "--final-save", action="store_true", help="Save final checkpoint after training" + ) + + args = parser.parse_args() + + # Warn about concurrency/pool mismatch + if args.concurrency > 
args.sandbox_workers: + print( + f"WARNING: concurrency ({args.concurrency}) > sandbox-workers ({args.sandbox_workers})" + ) + print( + f" This means {args.concurrency - args.sandbox_workers} tasks will wait for sandboxes." + ) + print( + f" Consider: --sandbox-workers={args.concurrency} OR --concurrency={args.sandbox_workers}" + ) + print() + + # Setup rollout log + rollout_log_path = os.path.abspath(args.rollout_log) + os.makedirs(os.path.dirname(rollout_log_path) or ".", exist_ok=True) + open(rollout_log_path, "a", encoding="utf-8").close() + + # Load tokenizer early (needed for prompt length filtering) + print(f"Loading tokenizer: {args.model}") + tokenizer = AutoTokenizer.from_pretrained(args.model) + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + + # Load data and split into train/eval sets + print(f"Loading APPS samples (split={args.split}, limit={args.limit})...") + all_samples = load_apps_samples(args.split, args.limit, args.difficulty) + if not all_samples: + print("ERROR: No APPS samples loaded.") + return 1 + + # Filter out samples with prompts that exceed max_prompt_tokens + # This ensures max_new_tokens can fit within the model's context window + def prompt_fits(sample: Dict[str, Any]) -> bool: + messages = [ + {"role": "system", "content": APPS_SYSTEM_PROMPT}, + {"role": "user", "content": sample["question"]}, + ] + token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True) + return len(token_ids) <= args.max_prompt_tokens + + pre_filter_count = len(all_samples) + all_samples = [s for s in all_samples if prompt_fits(s)] + filtered_count = pre_filter_count - len(all_samples) + if filtered_count > 0: + print( + f"Filtered {filtered_count} samples exceeding {args.max_prompt_tokens} prompt tokens." + ) + + if not all_samples: + print( + "ERROR: All samples filtered out by prompt length. Increase --max-prompt-tokens." 
+ ) + return 1 + + # Split: last N samples for eval (deterministic, reproducible) + if args.eval_samples > 0 and len(all_samples) > args.eval_samples: + train_samples = all_samples[: -args.eval_samples] + eval_samples = all_samples[-args.eval_samples :] + else: + train_samples = all_samples + eval_samples = [] + + print(f"Loaded {len(all_samples)} total samples (after filtering).") + print(f"  Train: {len(train_samples)} samples") + print(f"  Eval:  {len(eval_samples)} samples (held out)") + + samples_q: queue.Queue = queue.Queue() + for idx, s in enumerate(train_samples): + samples_q.put((idx, s)) + + # Load model with LoRA + print(f"Loading model: {args.model}") + + # Configure Flash Attention (auto-detects optimal implementation) + device = "cuda" if torch.cuda.is_available() else "cpu" + attn_impl = configure_flash_attention(device, disable_flash_attn=args.disable_flash_attn) + log_hardware_info() + print(f"Attention implementation: {attn_impl}") + + base_model = AutoModelForCausalLM.from_pretrained( + args.model, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation=attn_impl, + ) + + # Apply LoRA adapter + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=args.lora_rank, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + use_rslora=args.lora_use_rslora, + bias="none", + # target_modules="all-linear", + target_modules=[ + "q_proj",  # Attention: Query projection + "k_proj",  # Attention: Key projection + "v_proj",  # Attention: Value projection + "o_proj",  # Attention: Output projection + "gate_proj",  # MLP: Gating projection + "up_proj",  # MLP: Up projection + "down_proj",  # MLP: Down projection + ], + ) + model = get_peft_model(base_model, lora_config) + model.to(device) + model.print_trainable_parameters() + print( + f"Model loaded on {device} with LoRA (rank={args.lora_rank}, alpha={args.lora_alpha})."
+ ) + + # Setup sandbox pool + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + # Build backend kwargs (minimal mode skips memory/network limits for HPC compatibility) + backend_kwargs = {} + if args.minimal_sandbox: + backend_kwargs["memory_limit"] = None + backend_kwargs["network_disabled"] = False + + try: + sandbox_pool = loop.run_until_complete( + create_sandbox_pool( + n_workers=args.sandbox_workers, + backend=args.sandbox_backend, + python_version=args.python_version, + max_concurrent_ops=args.max_concurrent_ops, + cache_size=10000, + **backend_kwargs, + ) + ) + except RuntimeError as e: + print(f"ERROR: {e}") + return 1 + + # Create shared adapter and config + test_adapter = APPSTestAdapter() + env_config = CodeExecConfig( + timeout_per_test_s=args.timeout_per_test, + stop_on_first_failure=False, + compile_first=True, + partial_credit=args.partial_credit, + compile_failure_reward=-0.1, + use_cache=True, + ) + + # Shared client for inference + client = VLLMChatClient(host=args.host, port=args.port, enable_weight_updates=True) + publisher = create_vllm_publisher(client) + + # Environment factory (captures sandbox_pool via closure) + def env_factory(sample: Dict[str, Any]) -> CodeExecEnv: + return CodeExecEnv( + sample=sample, + sandbox_pool=sandbox_pool, + test_adapter=test_adapter, + config=env_config, + system_prompt=APPS_SYSTEM_PROMPT, + ) + + env_registry = {"apps": env_factory} + + def protocol_factory(): + return SingleAgentProtocol( + agent=Agent( + client=client, + model=args.model, + ctx=FullDialog(), + parser=code_block_parser, + ) + ) + + protocol_registry = {"single_agent": protocol_factory} + + # Algorithm (CISPO - better for reasoning tokens) + algo = make_cispo( + group_size=args.group_size, + group_normalize_adv=True, + clip_eps_high=0.2, + length_normalize=True, + kl_coeff=args.kl_coeff, + ) + print("Using CISPO algorithm (better for reasoning/self-correction tokens)") + print(f"KL coefficient: {args.kl_coeff}") + + # Engine 
+ batch source + engine = RolloutEngine( + env_registry=env_registry, + protocol_registry=protocol_registry, + jsonl_path=rollout_log_path, + ) + + train_inference = InferenceSpec( + sampling=SamplingParams( + temperature=args.train_temperature, + max_tokens=args.max_new_tokens, + stop=args.stop, + ), + return_=ReturnSpec.for_rl(top_logprobs_k=1), + ) + + requests_fn = make_dataset_queue_requests_fn( + samples_q, + batch_size=args.batch_size, + env_kind="apps", + protocol_kind="single_agent", + inference=train_inference, + protocol_kwargs={}, + request_meta_fn=lambda idx, sample: { + "sample_index": idx, + "problem_id": sample.get("problem_id", idx), + "difficulty": sample.get("difficulty", "unknown"), + }, + env_seed_fn=lambda idx, _sample: idx, + sampling_seed_fn=lambda idx, _sample: idx, + group_size=args.group_size, + ) + + batch_source = RolloutBatchSource( + orchestrator=engine, + credit_assigner=algo.credit_assigner, + requests_fn=requests_fn, + max_steps=1, # Single-step env + concurrency=args.concurrency, + ) + + # Trainer config with eval settings + cfg = TrainerConfig( + model_device=device, + lr=args.lr, + max_seq_len=args.max_seq_len, + micro_token_budget=args.micro_token_budget, + max_grad_norm=0.1, + pad_token_id=tokenizer.pad_token_id, + eval_at_start=bool(args.eval_before_start and eval_samples), + eval_every_n_steps=( + args.eval_every + if args.eval_every and args.eval_every > 0 and eval_samples + else None + ), + eval_concurrency=args.eval_concurrency, + eval_max_steps=1, + ) + + checkpoint_cfg = CheckpointConfig( + output_dir=args.checkpoint_dir, + every_n_steps=args.checkpoint_every, + max_to_keep=2, + save_optimizer=True, + ) + + # Training reducers + reducers = { + "all_passed_rate": Reducer( + kind="count_true", + source="all_passed", + normalize_by="rollouts", + ), + "compile_fail_rate": Reducer( + kind="count_true", + source="compile_failed", + normalize_by="rollouts", + ), + "avg_pass_rate": Reducer( + kind="mean", + source="pass_rate", 
+ ), + "parse_err_rate": Reducer( + kind="count_true", + source="parse_error", + normalize_by="samples", + ), + "total_completion_tokens": Reducer( + kind="sum", + source="completion_length", + ), + **default_reducers(), + } + + # Eval reducers (for held-out samples) + eval_reducers = { + "all_passed_rate": Reducer( + kind="count_true", + source="all_passed", + normalize_by="rollouts", + as_percent=True, + ), + "compile_fail_rate": Reducer( + kind="count_true", + source="compile_failed", + normalize_by="rollouts", + as_percent=True, + ), + "avg_pass_rate": Reducer( + kind="mean", + source="pass_rate", + ), + "parse_error_rate": Reducer( + kind="count_true", + source="parse_error", + normalize_by="samples", + as_percent=True, + ), + "avg_completion_tokens": Reducer( + kind="mean", + source="completion_length", + ), + } + + # Logging metrics to track + log_keys = [ + # Core training + "train/loss", + "train/avg_total_reward", + # APPS-specific + "train/all_passed_rate", + "train/compile_fail_rate", + "train/avg_pass_rate", + "train/parse_err_rate", + "train/avg_completion_length", + # Eval metrics + "eval/all_passed_rate", + "eval/compile_fail_rate", + "eval/avg_pass_rate", + "eval/parse_error_rate", + "eval/avg_completion_tokens", + # Counts + "train/target_rollouts", + "train/num_samples", + ] + + # Configure logger (WandB or RichLive terminal dashboard) + if args.wandb: + train_logger = WandbLogger(project=args.wandb_project, config=dict(vars(args))) + print(f"WandB logging enabled: project={args.wandb_project}") + else: + train_logger = RichLiveLogger( + keys=log_keys, + spark_key="avg_total_reward", + history=100, + precision=4, + ) + + # Create EngineEvaluator for eval set + eval_inference = InferenceSpec( + sampling=SamplingParams( + temperature=args.eval_temperature, + max_tokens=args.max_new_tokens, + stop=args.stop, + ), + return_=ReturnSpec.for_eval(return_token_ids=True), + ) + + evaluator = None + if eval_samples: + evaluator = EngineEvaluator( + 
engine=RolloutEngine( + env_registry=env_registry, protocol_registry=protocol_registry + ), + requests_fn=lambda: [ + RolloutRequest( + env=EnvSpec(kind="apps", kwargs={"sample": sample}), + protocol=ProtocolSpec(kind="single_agent"), + env_seed=idx, + sampling_seed=idx, + inference=eval_inference, + num_episodes=1, + meta={ + "eval_idx": idx, + "problem_id": sample.get("problem_id", idx), + "difficulty": sample.get("difficulty", "unknown"), + }, + ) + for idx, sample in enumerate(eval_samples) + ], + reducers=eval_reducers, + max_steps=1, + timeout_s=cfg.eval_timeout_s, + concurrency=cfg.eval_concurrency, + ) + print( + f"Eval configured: {len(eval_samples)} samples, every {args.eval_every} steps" + ) + + trainer = Trainer( + model=model, + algo=algo, + batch_source=batch_source, + publisher=publisher, + enable_gradient_checkpointing=True, + cfg=cfg, + checkpoint_config=checkpoint_cfg, + train_logger=train_logger, + reducers=reducers, + evaluator=evaluator, + ) + + print(f"\nStarting training for {args.train_steps} steps...") + print(f" Samples: {len(train_samples)}") + print(f" Batch size: {args.batch_size}") + print(f" Group size: {args.group_size}") + print(f" Concurrency: {args.concurrency}") + print(f" Sandbox workers: {args.sandbox_workers}") + print(f" Sandbox backend: {args.sandbox_backend}") + print() + + try: + loop.run_until_complete(trainer.train(args.train_steps)) + except RequestsExhausted: + print("Training samples exhausted; stopping.") + except KeyboardInterrupt: + print("\nTraining interrupted.") + finally: + # Cleanup sandbox pool + print("Shutting down sandbox pool...") + loop.run_until_complete(sandbox_pool.shutdown()) + loop.close() + + # Save final checkpoint if requested + if args.final_save: + try: + ckpt_path = trainer.save_checkpoint(metadata={"final": True}) + print(f"Final checkpoint saved: {ckpt_path}") + except RuntimeError: + pass # No checkpointer configured + + # Close WandB if used + if args.wandb: + train_logger.close() + 
print("WandB run finished.") + + print("Training complete.") + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/examples/fsdp2_training/train_math_fsdp2.py b/examples/fsdp2_training/train_math_fsdp2.py index 8c8730f..23b9907 100644 --- a/examples/fsdp2_training/train_math_fsdp2.py +++ b/examples/fsdp2_training/train_math_fsdp2.py @@ -30,7 +30,7 @@ from ludic.agent import Agent from ludic.context import FullDialog from ludic.inference import VLLMChatClient, InferenceSpec, SamplingParams, ReturnSpec -from ludic.interaction import SingleAgentSyncProtocol +from ludic.interaction import SingleAgentProtocol from ludic.distributed import create_vllm_publisher from ludic.parsers import boxed_parser, extract_last_boxed_content from ludic.eval import EngineEvaluator @@ -301,7 +301,7 @@ def main() -> None: env_registry = {"math": lambda sample: MATHEnv(sample=sample, system_prompt=args.system_prompt)} def protocol_factory(): - return SingleAgentSyncProtocol( + return SingleAgentProtocol( agent=Agent( client=client, model=args.model, diff --git a/examples/gsm8k/train_gsm8k.py b/examples/gsm8k/train_gsm8k.py index 6b3c4bb..6d5cec3 100644 --- a/examples/gsm8k/train_gsm8k.py +++ b/examples/gsm8k/train_gsm8k.py @@ -4,7 +4,7 @@ This wires together: - HF datasets for GSM8K samples - single-sample QA envs (GSM8KEnv) - - SingleAgentSyncProtocol with a shared VLLMChatClient + - SingleAgentProtocol with a shared VLLMChatClient - RolloutBatchSource + MonteCarloReturn credit - Trainer with REINFORCE loss @@ -27,7 +27,7 @@ from ludic.agent import Agent from ludic.context import FullDialog from ludic.inference import VLLMChatClient, InferenceSpec, SamplingParams, ReturnSpec -from ludic.interaction import SingleAgentSyncProtocol +from ludic.interaction import SingleAgentProtocol from ludic.parsers import boxed_parser from ludic.distributed.adapters import create_vllm_publisher from ludic.eval import EngineEvaluator @@ -140,7 +140,7 @@ def main(): env_registry = {"gsm8k": 
lambda sample: GSM8KEnv(sample=sample, system_prompt=args.system_prompt)} def protocol_factory(): - return SingleAgentSyncProtocol( + return SingleAgentProtocol( agent=Agent( client=client, model=args.model, diff --git a/examples/pipeline_rl/run_actor.py b/examples/pipeline_rl/run_actor.py index 462a9b2..0a154db 100644 --- a/examples/pipeline_rl/run_actor.py +++ b/examples/pipeline_rl/run_actor.py @@ -14,7 +14,7 @@ RolloutRequest, make_reinforce, ) -from ludic.interaction import SingleAgentSyncProtocol +from ludic.interaction import SingleAgentProtocol # Env Import from environments.tic_tac_toe import TicTacToeEnv @@ -40,11 +40,11 @@ def create_engine(client: VLLMChatClient) -> RolloutEngine: training_prompt = base_prompt + "\n\nOutput your move as a single XML tag, e.g., A1." def create_protocol(): - return SingleAgentSyncProtocol( + return SingleAgentProtocol( agent=Agent( - client=client, - model=MODEL_NAME, - ctx=FullDialog(system_prompt=training_prompt), + client=client, + model=MODEL_NAME, + ctx=FullDialog(system_prompt=training_prompt), parser=xml_tag_parser("move") ), stop_on_parse_error=True, diff --git a/examples/rejection_sampling.py b/examples/rejection_sampling.py index e12a77d..a2b52c4 100644 --- a/examples/rejection_sampling.py +++ b/examples/rejection_sampling.py @@ -17,7 +17,7 @@ from ludic.agent import Agent from ludic.context import FullDialog from ludic.inference import VLLMChatClient, InferenceSpec, SamplingParams -from ludic.interaction import SingleAgentSyncProtocol +from ludic.interaction import SingleAgentProtocol from ludic.parsers import xml_tag_parser from ludic.training import RolloutEngine, EnvSpec, ProtocolSpec, RolloutRequest from ludic.types import Rollout @@ -78,7 +78,7 @@ async def generate_filtered_data(args: argparse.Namespace) -> None: prompt_text = build_system_prompt() def create_protocol(): - return SingleAgentSyncProtocol( + return SingleAgentProtocol( agent=Agent( client=client, model=args.model, diff --git 
a/examples/tic_tac_toe/generate_synth_data.py b/examples/tic_tac_toe/generate_synth_data.py index ed9f90a..e635eb6 100644 --- a/examples/tic_tac_toe/generate_synth_data.py +++ b/examples/tic_tac_toe/generate_synth_data.py @@ -46,8 +46,7 @@ def build_system_prompt() -> str: """Build system prompt matching train_tic_tac_toe.py""" base_prompt = TicTacToeEnv().suggested_sysprompt or "" return ( - base_prompt - + "\n\nThink through the board in .... " + base_prompt + "\n\nThink through the board in .... " "After , output exactly one XML tag of the form A1 and nothing else." ) @@ -160,7 +159,9 @@ def apply_prompt_format( if include_step: truncated_messages = _truncate_history_messages(full_messages, placeholder) - chat_messages = truncated_messages if truncate_history else list(full_messages) + chat_messages = ( + truncated_messages if truncate_history else list(full_messages) + ) prompt_text = _messages_to_prompt(chat_messages) @@ -225,7 +226,7 @@ async def generate_synth_data(args: argparse.Namespace) -> None: prompt_text = build_system_prompt() def create_protocol(): - return SingleAgentSyncProtocol( + return SingleAgentProtocol( agent=Agent( client=client, model=args.model, @@ -248,7 +249,9 @@ def create_protocol(): if args.min_completion_tokens > 0 or args.max_completion_tokens > 0: return_spec = ReturnSpec.for_eval(return_token_ids=True) inference = InferenceSpec( - sampling=SamplingParams(temperature=args.temperature, max_tokens=args.max_tokens), + sampling=SamplingParams( + temperature=args.temperature, max_tokens=args.max_tokens + ), return_=return_spec, ) @@ -286,7 +289,9 @@ def create_protocol(): results[res] += 1 else: results["other"] += 1 - print(f"Generated {total} rollouts: {results['win']} wins, {results['loss']} losses, {results['draw']} draws") + print( + f"Generated {total} rollouts: {results['win']} wins, {results['loss']} losses, {results['draw']} draws" + ) # Filter and transform out_path = Path(args.output) @@ -323,16 +328,24 @@ def 
create_protocol(): too_short = stats.get("too_short", 0) kept_steps = stats.get("kept_steps", 0) if missing_trace: - print(f"Skipped {missing_trace} steps missing token traces (enable return_token_ids).") + print( + f"Skipped {missing_trace} steps missing token traces (enable return_token_ids)." + ) if too_short: - print(f"Skipped {too_short} steps with completion length < {args.min_completion_tokens}.") + print( + f"Skipped {too_short} steps with completion length < {args.min_completion_tokens}." + ) if too_long: - print(f"Skipped {too_long} steps with completion length > {args.max_completion_tokens}.") + print( + f"Skipped {too_long} steps with completion length > {args.max_completion_tokens}." + ) print(f"Kept {kept_steps} steps after length filtering.") if dropped_empty: print(f"Skipped {dropped_empty} rollouts with no remaining steps.") if args.transform: - print(f" (transformed to TruncatedThinking format with placeholder: '{args.placeholder}')") + print( + f" (transformed to TruncatedThinking format with placeholder: '{args.placeholder}')" + ) def main(): @@ -345,8 +358,12 @@ def main(): parser.add_argument("--port", type=int, default=8000) # Generation - parser.add_argument("--episodes", type=int, default=5000, help="Total episodes to generate.") - parser.add_argument("--max-steps", type=int, default=5, help="Max steps per episode.") + parser.add_argument( + "--episodes", type=int, default=5000, help="Total episodes to generate." + ) + parser.add_argument( + "--max-steps", type=int, default=5, help="Max steps per episode." 
+ ) parser.add_argument("--concurrency", type=int, default=32) parser.add_argument("--temperature", type=float, default=0.8) parser.add_argument("--max-tokens", type=int, default=250) @@ -364,16 +381,35 @@ def main(): ) # Transformation - parser.add_argument("--transform", action="store_true", default=True, - help="Truncate history to TruncatedThinking format (default: True)") - parser.add_argument("--no-transform", action="store_false", dest="transform", - help="Keep full assistant history in prompts") - parser.add_argument("--placeholder", default="[TRUNCATED]", - help="Placeholder for truncated thinking blocks") - parser.add_argument("--lean", action="store_true", default=True, - help="Drop heavy metadata to keep JSONL small (default: True)") - parser.add_argument("--no-lean", action="store_false", dest="lean", - help="Keep full step/meta fields") + parser.add_argument( + "--transform", + action="store_true", + default=True, + help="Truncate history to TruncatedThinking format (default: True)", + ) + parser.add_argument( + "--no-transform", + action="store_false", + dest="transform", + help="Keep full assistant history in prompts", + ) + parser.add_argument( + "--placeholder", + default="[TRUNCATED]", + help="Placeholder for truncated thinking blocks", + ) + parser.add_argument( + "--lean", + action="store_true", + default=True, + help="Drop heavy metadata to keep JSONL small (default: True)", + ) + parser.add_argument( + "--no-lean", + action="store_false", + dest="lean", + help="Keep full step/meta fields", + ) # Output parser.add_argument("--output", default="data/tictactoe_sft_train_data.jsonl") diff --git a/examples/tic_tac_toe/sft_tic_tac_toe.py b/examples/tic_tac_toe/sft_tic_tac_toe.py index 80c7eff..6461210 100644 --- a/examples/tic_tac_toe/sft_tic_tac_toe.py +++ b/examples/tic_tac_toe/sft_tic_tac_toe.py @@ -206,7 +206,7 @@ def main() -> None: model = AutoModelForCausalLM.from_pretrained( args.model, - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, 
device_map={"": "cpu"}, low_cpu_mem_usage=True, trust_remote_code=True, diff --git a/examples/tic_tac_toe/train_tic_tac_toe.py b/examples/tic_tac_toe/train_tic_tac_toe.py index ebe040b..55230ca 100644 --- a/examples/tic_tac_toe/train_tic_tac_toe.py +++ b/examples/tic_tac_toe/train_tic_tac_toe.py @@ -3,7 +3,7 @@ This wires together: - TicTacToeEnv single-agent episodes - - SingleAgentSyncProtocol with a shared VLLMChatClient + - SingleAgentProtocol with a shared VLLMChatClient - RolloutBatchSource + GroupNormalizedReturn credit - Trainer with REINFORCE loss - Optional periodic eval of win rate @@ -25,7 +25,7 @@ from ludic.agent import Agent from ludic.context import FullDialog, TruncatedThinkingContext from ludic.inference import VLLMChatClient, InferenceSpec, SamplingParams, ReturnSpec -from ludic.interaction import SingleAgentSyncProtocol +from ludic.interaction import SingleAgentProtocol from ludic.distributed.adapters import create_vllm_publisher from ludic.parsers import compose_parsers, think_prefix_parser, xml_tag_parser from ludic.eval import EngineEvaluator @@ -216,7 +216,7 @@ def protocol_factory(): ctx = TruncatedThinkingContext(system_prompt=system_prompt) else: ctx = FullDialog(system_prompt=system_prompt) - return SingleAgentSyncProtocol( + return SingleAgentProtocol( agent=Agent( client=client, model=args.model, diff --git a/pyproject.toml b/pyproject.toml index 274e1e7..a87fb8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,20 +11,27 @@ dependencies = [ "aiohttp>=3.13.2", "beartype>=0.22.9", "jaxtyping>=0.3.4", + "datasets>=4.4.2", "openai>=2.7.1", "peft>=0.18.0", "rich>=14.2.0", - "torch>=2.8.0", - "vllm>=0.13.0", + "setuptools>=79.0.1", + # CRITICAL: torch>=2.9.0 required for aarch64 CUDA wheels + # PyTorch 2.8.0 has NO aarch64 CUDA wheels - skip it! 
+ # See: https://download.pytorch.org/whl/cu128/torch/ + "torch>=2.9.0", + # vLLM is Linux-only (depends on NVIDIA libraries) + # Use sys_platform marker to skip on macOS/Windows + "torch-c-dlpack-ext>=0.1.4", + "vllm>=0.12.0; sys_platform == 'linux'", + "wandb>=0.23.1", + # Flash Attention for efficient attention computation (Linux-only, requires CUDA) + "flash-attn>=2.7.0; sys_platform == 'linux'", ] [project.optional-dependencies] -pipelinerl = [ - "redis>=7.1.0", -] -examples = [ - "datasets==4.4.1", # pinned to the versions in uv.lock that are known to work - "math-verify==0.8.0", # pinned to the versions in uv.lock that are known to work +code-exec = [ + "docker>=7.1.0", ] [build-system] @@ -56,3 +63,55 @@ markers = [ "diagnostic: marks tests that primarily emit diagnostic reports rather than asserting strict correctness", ] testpaths = ["tests"] + +# ============================================================================= +# uv Configuration for Cross-Platform PyTorch +# ============================================================================= +# This configuration automatically selects the correct PyTorch wheels: +# - Linux: CUDA 12.8 wheels from pytorch-cu128 index +# - macOS/Windows: CPU wheels from pytorch-cpu index +# +# Usage: +# Local dev (macOS): uv sync +# HPC (Linux GPU): uv sync +# Linux CI (no GPU): uv sync --extra cpu +# +# See: https://docs.astral.sh/uv/guides/integration/pytorch/ +# See: https://docs.isambard.ac.uk/user-documentation/applications/ML-packages/ +# ============================================================================= + +[tool.uv] +# Flash Attention build configuration: +# - Disable build isolation so torch is available during compilation +# - Declare build-time dependencies explicitly +# - Set MAX_JOBS for parallel compilation +no-build-isolation-package = ["flash-attn"] + +[tool.uv.extra-build-dependencies] +flash-attn = ["torch", "packaging", "ninja"] + +[tool.uv.extra-build-variables] +flash-attn = { MAX_JOBS 
= "16" } + +# Platform-based torch source selection: +# - Linux: Use CUDA 12.8 wheels (supports both x86_64 and aarch64) +# - Non-Linux (macOS, Windows): Use CPU wheels +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", marker = "sys_platform != 'linux'" }, + { index = "pytorch-cu128", marker = "sys_platform == 'linux'" }, +] +torchvision = [ + { index = "pytorch-cpu", marker = "sys_platform != 'linux'" }, + { index = "pytorch-cu128", marker = "sys_platform == 'linux'" }, +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true diff --git a/research/context-notes.md b/research/context-notes.md new file mode 100644 index 0000000..bc0de29 --- /dev/null +++ b/research/context-notes.md @@ -0,0 +1,21 @@ +# ScaleRL Implementation Notes + +## Paper References +- ScaleRL (research/papers/arXiv-scalerl/arXiv-2510.13786v1.md) +- Vanilla GRPO (research/papers/arXiv-vanilla_grpo/arXiv-2402.03300.md) +- DAPO (research/papers/arXiv-dapo/arXiv-2503.14476.md) +- RL-ZVP (research/papers/arXiv-rl_zvp/arXiv-2509.21880.md) +- Tricks or Traps? Part I (research/papers/arXiv-tricks_or_traps_part1/arXiv-2508.08221.md) + +## Core Defaults +- `LossFunction`: default `CISPOLoss` with asymmetric clipping (`clip_low=0.20`, `clip_high=0.28`), `kl_coeff=0.0`. +- `AdvantageEstimator`: group-mean centering + batch-level std for non-ZVP prompts. +- `alpha_zvp=0.1` for RL-ZVP entropy-scaled advantages; no batch-level rescaling applied in the ZVP branch. +- Reward shaping: asymmetric clipping + truncation masking by default; optional length penalty off (`length_penalty=0.0`). +- Metrics: track `frac_reward_zero_std`, entropy, clipping hit rates, pass-rate statistics, advantage summary stats, KL divergence (logged when active). + +## Integration Guardrails +- Always check logits arrive in FP32; raise if precision lower. 
+- Detect zero-variance prompts using per-group reward std; only apply batch-level std to non-ZVP prompts. +- Upstream sampler will supply prompt groups; assume mixed pass/fail batches and zero-variance prompts retained for RL-ZVP path. +- If additional post-processing scales advantages, ensure RL-ZVP advantages are exempt or scale through `alpha_zvp`. diff --git a/research/papers/arXiv-dapo/_page_0_Figure_12.jpeg b/research/papers/arXiv-dapo/_page_0_Figure_12.jpeg new file mode 100644 index 0000000..d0b3690 Binary files /dev/null and b/research/papers/arXiv-dapo/_page_0_Figure_12.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_0_Picture_2.jpeg b/research/papers/arXiv-dapo/_page_0_Picture_2.jpeg new file mode 100644 index 0000000..3013b72 Binary files /dev/null and b/research/papers/arXiv-dapo/_page_0_Picture_2.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_2_Figure_0.jpeg b/research/papers/arXiv-dapo/_page_2_Figure_0.jpeg new file mode 100644 index 0000000..be3f7f1 Binary files /dev/null and b/research/papers/arXiv-dapo/_page_2_Figure_0.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_4_Figure_0.jpeg b/research/papers/arXiv-dapo/_page_4_Figure_0.jpeg new file mode 100644 index 0000000..fa9d74b Binary files /dev/null and b/research/papers/arXiv-dapo/_page_4_Figure_0.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_4_Figure_1.jpeg b/research/papers/arXiv-dapo/_page_4_Figure_1.jpeg new file mode 100644 index 0000000..ab7cf1e Binary files /dev/null and b/research/papers/arXiv-dapo/_page_4_Figure_1.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_5_Figure_0.jpeg b/research/papers/arXiv-dapo/_page_5_Figure_0.jpeg new file mode 100644 index 0000000..7d45b06 Binary files /dev/null and b/research/papers/arXiv-dapo/_page_5_Figure_0.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_5_Figure_1.jpeg b/research/papers/arXiv-dapo/_page_5_Figure_1.jpeg new file mode 100644 index 0000000..460bdce Binary files /dev/null and 
b/research/papers/arXiv-dapo/_page_5_Figure_1.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_6_Figure_0.jpeg b/research/papers/arXiv-dapo/_page_6_Figure_0.jpeg new file mode 100644 index 0000000..2495acb Binary files /dev/null and b/research/papers/arXiv-dapo/_page_6_Figure_0.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_7_Figure_0.jpeg b/research/papers/arXiv-dapo/_page_7_Figure_0.jpeg new file mode 100644 index 0000000..92479ee Binary files /dev/null and b/research/papers/arXiv-dapo/_page_7_Figure_0.jpeg differ diff --git a/research/papers/arXiv-dapo/_page_9_Figure_0.jpeg b/research/papers/arXiv-dapo/_page_9_Figure_0.jpeg new file mode 100644 index 0000000..9d22b77 Binary files /dev/null and b/research/papers/arXiv-dapo/_page_9_Figure_0.jpeg differ diff --git a/research/papers/arXiv-dapo/arXiv-2503.14476.md b/research/papers/arXiv-dapo/arXiv-2503.14476.md new file mode 100644 index 0000000..7cade06 --- /dev/null +++ b/research/papers/arXiv-dapo/arXiv-2503.14476.md @@ -0,0 +1,463 @@ +![](_page_0_Picture_2.jpeg) + +# DAPO: An Open-Source LLM Reinforcement Learning System at Scale + +¹ByteDance Seed ²Institute for AI Industry Research (AIR), Tsinghua University ³The University of Hong Kong ⁴SIA-Lab of Tsinghua AIR and ByteDance Seed. Full author list in Contributions. + +## **Abstract** + +Inference scaling empowers LLMs with unprecedented reasoning ability, with reinforcement learning as the core technique to elicit complex reasoning. However, key technical details of state-of-the-art reasoning LLMs are concealed (such as in the OpenAI o1 blog and the DeepSeek R1 technical report), thus the community still struggles to reproduce their RL training results. We propose the **D**ecoupled Clip and **D**ynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm, and fully open-source a state-of-the-art large-scale RL system that achieves 50 points on AIME 2024 using the Qwen2.5-32B base model.
Unlike previous works that withhold training details, we introduce four key techniques of our algorithm that make large-scale LLM RL a success. In addition, we open-source our training code, which is built on the **verl** framework[^a], along with a carefully curated and processed dataset. These components of our open-source system enhance reproducibility and support future research in large-scale LLM RL. + +**Date:** March 17, 2025 + +Correspondence: zhouhao@air.tsinghua.edu.cn, wangmingxuan.89@bytedance.com + +Project Page: https://dapo-sia.github.io/ + +[^a]: https://github.com/volcengine/verl + +![](_page_0_Figure_12.jpeg) + +**Figure 1** AIME 2024 scores of **DAPO** on the Qwen2.5-32B base model, outperforming the previous SoTA DeepSeek-R1-Zero-Qwen-32B using 50% of the training steps. The x-axis represents the gradient update steps. + +# 1 Introduction + +Test-time scaling such as OpenAI's o1 [1] and DeepSeek's R1 [2] brings a profound paradigm shift to Large Language Models (LLMs) [3–7]. Test-time scaling enables longer Chain-of-Thought thinking and induces sophisticated reasoning behaviors, which makes the models superior in competitive math and coding tasks like AIME and Codeforces. + +The central technique driving the revolution is large-scale Reinforcement Learning (RL), which elicits complex reasoning behaviors such as self-verification and iterative refinement. However, the actual algorithm and key recipe for scalable RL training remain a mystery, hidden from the technical reports of existing reasoning models [1, 2, 8–11]. In this paper, we reveal significant obstacles in large-scale RL training and open-source a scalable RL system with a fully open-sourced algorithm, training code, and dataset, providing democratized solutions with industry-level RL results. + +We experiment with Qwen2.5-32B [12] as the pretrained model for RL. In our initial GRPO run, we achieved only 30 points on AIME, a performance significantly below DeepSeek's RL (47 points).
A thorough analysis reveals that the naive GRPO baseline suffers from several key issues such as entropy collapse, reward noise, and training instability. The broader community has encountered similar challenges in reproducing DeepSeek's results [13–19], suggesting that critical training details may have been omitted in the R1 paper that are required to develop an industry-level, large-scale, and reproducible RL system. + +To close this gap, we release an open-source state-of-the-art system for large-scale LLM RL, which achieves 50 points on AIME 2024 based on the Qwen2.5-32B model, outperforming the previous state-of-the-art results achieved by DeepSeek-R1-Zero-Qwen-32B [2] (47 points) using 50% of the training steps (Figure 1). We propose the **D**ecoupled Clip and **D**ynamic s**A**mpling **P**olicy **O**ptimization (**DAPO**) algorithm, and introduce 4 key techniques to make RL shine in the long-CoT RL scenario. Details are presented in Section 3. + +- 1. **Clip-Higher**, which promotes the diversity of the system and avoids entropy collapse; +- 2. **Dynamic Sampling**, which improves training efficiency and stability; +- 3. **Token-Level Policy Gradient Loss**, which is critical in long-CoT RL scenarios; +- 4. **Overlong Reward Shaping**, which reduces reward noise and stabilizes training. + +Our implementation is based on verl [20]. By fully releasing our state-of-the-art RL system including training code and data, we aim to reveal valuable insights into large-scale LLM RL that benefit the larger community. + +## 2 Preliminary + +### 2.1 Proximal Policy Optimization (PPO) + +PPO [21] introduces a clipped surrogate objective for policy optimization. By constraining the policy updates within a proximal region of the previous policy using clip, PPO stabilizes training and improves sample efficiency.
Specifically, PPO updates the policy by maximizing the following objective: + +$$\mathcal{J}_{\text{PPO}}(\theta) = \mathbb{E}_{(q,a) \sim \mathcal{D}, o_{\leq t} \sim \pi_{\theta_{\text{old}}}(\cdot \mid q)} \left[ \min \left( \frac{\pi_{\theta}(o_t \mid q, o_{< t})}{\pi_{\theta_{\text{old}}}(o_t \mid q, o_{< t})} \hat{A}_t, \operatorname{clip} \left( \frac{\pi_{\theta}(o_t \mid q, o_{< t})}{\pi_{\theta_{\text{old}}}(o_t \mid q, o_{< t})}, 1 - \varepsilon, 1 + \varepsilon \right) \hat{A}_t \right) \right], \tag{1}$$ + +where (q, a) is a question-answer pair from the data distribution $\mathcal{D}$, $\varepsilon$ is the clipping range of the importance sampling ratio, and $\hat{A}_t$ is an estimator of the advantage at time step t. Given the value function V and the reward function R, $\hat{A}_t$ is computed using Generalized Advantage Estimation (GAE) [22]: + +$$\hat{A}_t^{\text{GAE}(\gamma,\lambda)} = \sum_{l=0}^{\infty} (\gamma \lambda)^l \delta_{t+l}, \tag{2}$$ + +where + +$$\delta_l = R_l + \gamma V(s_{l+1}) - V(s_l), \quad 0 \le \gamma, \lambda \le 1. \tag{3}$$ + +![](_page_2_Figure_0.jpeg) + +**Figure 2** The accuracy on the AIME test set and the entropy of the actor model's generated probabilities during the RL training process, both before and after applying the **Clip-Higher** strategy. + +### 2.2 Group Relative Policy Optimization (GRPO) + +Compared to PPO, GRPO eliminates the value function and estimates the advantage in a group-relative manner. For a specific question-answer pair (q, a), the behavior policy $\pi_{\theta_{\text{old}}}$ samples a group of G individual responses $\{o_i\}_{i=1}^G$.
Then, the advantage of the i-th response is calculated by normalizing the group-level rewards $\{R_i\}_{i=1}^G$: + +$$\hat{A}_{i,t} = \frac{R_i - \text{mean}(\{R_i\}_{i=1}^G)}{\text{std}(\{R_i\}_{i=1}^G)}.\tag{4}$$ + +Similar to PPO, GRPO adopts a clipped objective, together with a directly imposed KL penalty term: + +$$\mathcal{J}_{\text{GRPO}}(\theta) = \mathbb{E}_{(q,a) \sim \mathcal{D}, \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot|q)} \left[ \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \left( \min\left(r_{i,t}(\theta) \hat{A}_{i,t}, \ \operatorname{clip}\left(r_{i,t}(\theta), 1 - \varepsilon, 1 + \varepsilon\right) \hat{A}_{i,t}\right) - \beta D_{\text{KL}}(\pi_{\theta}||\pi_{\text{ref}}) \right) \right], \tag{5}$$ + +where + +$$r_{i,t}(\theta) = \frac{\pi_{\theta}(o_{i,t} \mid q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,<t})}. \tag{6}$$ + +It is also worth noting that GRPO computes the objective at the sample level. To be exact, GRPO first calculates the mean loss within each generated sequence, before averaging the loss of different samples. As we will discuss in Section 3.3, this difference can affect the performance of the algorithm. + +### 2.3 Removing KL Divergence + +The KL penalty term is used to regulate the divergence between the online policy and the frozen reference policy. In the RLHF scenario [23], the goal of RL is to align the model behavior without diverging too far from the initial model. However, when training a long-CoT reasoning model, the model distribution can diverge significantly from the initial model, so this restriction is not necessary. Therefore, we exclude the KL term from our proposed algorithm. + +### 2.4 Rule-based Reward Modeling + +The use of a reward model usually suffers from the reward hacking problem [24–29].
Instead, we directly use the final accuracy of a verifiable task as the outcome reward, computed using the following rule:

$$R(\hat{y}, y) = \begin{cases} 1, & \text{is\_equivalent}(\hat{y}, y) \\ -1, & \text{otherwise,} \end{cases} \tag{7}$$

where y is the ground-truth answer and $\hat{y}$ is the predicted answer. This has proved to be an effective approach to activating the base model's reasoning capability, as shown in multiple domains such as automated theorem proving [30–33], computer programming [34–37], and mathematics competitions [2].

## 3 DAPO

We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. DAPO samples a group of outputs $\{o_i\}_{i=1}^G$ for each question q paired with the answer a, and optimizes the policy via the following objective:

$$\mathcal{J}_{\text{DAPO}}(\theta) = \mathbb{E}_{(q,a) \sim \mathcal{D},\, \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot \mid q)} \left[ \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \min \left( r_{i,t}(\theta) \hat{A}_{i,t}, \operatorname{clip}\left( r_{i,t}(\theta), 1 - \varepsilon_{\text{low}}, 1 + \varepsilon_{\text{high}} \right) \hat{A}_{i,t} \right) \right] \tag{8}$$

$$\text{s.t.} \quad 0 < \left| \{o_i \mid \text{is\_equivalent}(a, o_i)\} \right| < G,$$

where

$$r_{i,t}(\theta) = \frac{\pi_{\theta}(o_{i,t} \mid q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,<t})}, \quad \hat{A}_{i,t} = \frac{R_i - \operatorname{mean}(\{R_i\}_{i=1}^G)}{\operatorname{std}(\{R_i\}_{i=1}^G)}. \tag{9}$$

The full algorithm can be found in Algorithm 1. In this section, we introduce the key techniques of DAPO.

### 3.1 Raise the Ceiling: Clip-Higher

In our initial experiments using naive PPO [21] or GRPO [38], we observed the entropy collapse phenomenon: the entropy of the policy decreases quickly as training progresses (Figure 2b). The sampled responses of certain groups tend to be nearly identical.
This indicates limited exploration and an early deterministic policy, which can hinder the scaling process.

We propose the **Clip-Higher** strategy to address this issue. Clipping of the importance sampling ratio was introduced in Clipped Proximal Policy Optimization (PPO-Clip) [21] to restrict the trust region and enhance the stability of RL. We identify that the upper clip can restrict the exploration of the policy: making an 'exploitation' token more probable is easy, whereas the probability of an unlikely 'exploration' token is too tightly bounded to be raised substantially.

Concretely, when $\varepsilon = 0.2$ (the default value of most algorithms) and $\hat{A}_{i,t} > 0$ (the system tries to increase the probability), consider two actions with probabilities $\pi_{\theta_{\text{old}}}(o_i \mid q) = 0.01$ and 0.9. The upper bounds of the increased probabilities $\pi_{\theta}(o_i \mid q)$ are 0.012 and 1.08, respectively $(\pi_{\theta_{\text{old}}} \cdot (1 + \varepsilon))$. This implies that 'exploitation' tokens with a higher probability (e.g., 0.9) are effectively unconstrained and can reach extremely large probabilities such as 0.999. Conversely, for low-probability 'exploration' tokens, achieving a non-trivial increase in probability is considerably more challenging. Empirically, we also observe that the mean probability of up-clipped tokens is low: $\pi_{\theta}(o_i \mid q) < 0.2$ (Figure 3a). This finding supports our intuition that the upper clipping threshold indeed restricts the probability increase of low-probability 'exploration' tokens, thereby potentially constraining the exploration of the system.

- (a) Mean up-clipped probability.
- (b) The proportion of samples with an accuracy of 1.

**Figure 3** The mean up-clipped probability as well as the ratio of prompts with accuracy = 1.
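The asymmetry described above can be reproduced in a few lines. This is a minimal sketch; `upper_bound` is an illustrative helper, not part of any released code:

```python
# Upper bound that the clipped objective places on an updated token
# probability: pi_old * (1 + eps). With the default eps = 0.2, a rare token at
# pi_old = 0.01 can only reach 0.012, while a likely token at pi_old = 0.9 is
# effectively unconstrained (bound 1.08 > 1). Raising eps_high relaxes the
# rare-token cap, which is the point of Clip-Higher.

def upper_bound(pi_old: float, eps_high: float) -> float:
    """Maximum probability still rewarded by the clipped objective."""
    return pi_old * (1.0 + eps_high)

for pi_old in (0.01, 0.9):
    print(f"pi_old={pi_old}: eps=0.2 -> {upper_bound(pi_old, 0.2):.3f}, "
          f"eps_high=0.28 -> {upper_bound(pi_old, 0.28):.3f}")
```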
Adhering to the **Clip-Higher** strategy, we decouple the lower and upper clipping ranges as $\varepsilon_{\text{low}}$ and $\varepsilon_{\text{high}}$, as highlighted in Equation 10:

$$\mathcal{J}_{\text{DAPO}}(\theta) = \mathbb{E}_{(q,a) \sim \mathcal{D},\, \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot \mid q)} \left[ \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \min\left(r_{i,t}(\theta) \hat{A}_{i,t}, \operatorname{clip}\left(r_{i,t}(\theta), 1 - \varepsilon_{\text{low}}, 1 + \varepsilon_{\text{high}}\right) \hat{A}_{i,t}\right) \right] \tag{10}$$

$$\text{s.t.} \quad 0 < \left| \{o_i \mid \text{is\_equivalent}(a, o_i)\} \right| < G.$$

We increase the value of $\varepsilon_{\text{high}}$ to leave more room for the probabilities of low-probability tokens to increase. As shown in Figure 2, this adjustment effectively enhances the policy's entropy and facilitates the generation of more diverse samples. We leave $\varepsilon_{\text{low}}$ unchanged, because increasing it would suppress the probabilities of low-probability tokens toward 0, collapsing the sampling space.

### 3.2 The More the Merrier: Dynamic Sampling

Existing RL algorithms suffer from a gradient-decreasing problem when some prompts reach accuracy 1. For example, in GRPO, if all outputs $\{o_i\}_{i=1}^G$ of a particular prompt are correct and receive the same reward, the resulting advantage for this group is zero. A zero advantage yields zero policy gradients, shrinking the magnitude and increasing the noise sensitivity of the batch gradient, thereby degrading sample efficiency. Empirically, the number of samples with accuracy equal to 1 continues to increase, as shown in Figure 3b. This means that the effective number of prompts in each batch keeps decreasing, which can lead to larger variance in the gradient and dampen the gradient signal for model training.
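The zero-gradient effect can be checked directly from Equation 4. This is a minimal sketch; `group_advantages` is an illustrative name, and we map the zero-variance case to zero advantages:

```python
from statistics import mean, pstdev

def group_advantages(rewards):
    """Group-relative advantages (Equation 4): (R_i - mean) / std."""
    mu, sigma = mean(rewards), pstdev(rewards)
    if sigma == 0.0:
        # Every response in the group got the same reward (accuracy 0 or 1):
        # all advantages are zero, so this prompt contributes no gradient.
        return [0.0] * len(rewards)
    return [(r - mu) / sigma for r in rewards]

print(group_advantages([1, 1, 1, 1]))    # all-correct group -> no signal
print(group_advantages([1, -1, 1, -1]))  # mixed group -> informative signal
```

Dynamic sampling, described next, keeps resampling until the batch contains only such mixed groups.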
To this end, we propose to **over-sample and filter out prompts with accuracy equal to 1 or 0**, as illustrated in Equation 11, leaving the batch with only prompts that carry effective gradients while keeping the number of prompts consistent. The sampling cost for each batch is dynamic: before training on a batch, we keep sampling until the batch is fully filled with samples whose accuracy is neither 0 nor 1.

$$\mathcal{J}_{\text{DAPO}}(\theta) = \mathbb{E}_{(q,a) \sim \mathcal{D},\, \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot \mid q)} \left[ \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \min\left(r_{i,t}(\theta) \hat{A}_{i,t}, \operatorname{clip}\left(r_{i,t}(\theta), 1 - \varepsilon_{\text{low}}, 1 + \varepsilon_{\text{high}}\right) \hat{A}_{i,t}\right) \right] \tag{11}$$

$$\text{s.t.} \quad 0 < \left| \{o_i \mid \text{is\_equivalent}(a, o_i)\} \right| < G.$$

- (a) Entropy of the actor model's generation probabilities.
- (b) Average length of actor model-generated responses.

**Figure 4** The entropy of the probability distribution of the actor model, as well as the changes in response length.

Note that this strategy does not necessarily impede training efficiency, because generation time is typically dominated by long-tail samples when the RL system is synchronous and the generation stage is not pipelined. Besides, we find that with dynamic sampling the experiment achieves the same performance faster, as shown in Figure 6.

### 3.3 Rebalancing Act: Token-Level Policy Gradient Loss

The original GRPO algorithm employs a sample-level loss calculation: it first averages the losses over tokens within each sample and then aggregates the losses across samples. In this approach, each sample is assigned an equal weight in the final loss computation.
However, we find that this method of loss reduction introduces several challenges in long-CoT RL scenarios.

Since all samples are assigned the same weight in the loss calculation, tokens within longer responses (which contain more tokens) may have a disproportionately lower contribution to the overall loss, which can lead to two adverse effects. First, for high-quality long samples, this effect can impede the model's ability to learn reasoning-relevant patterns within them. Second, we observe that excessively long samples often exhibit low-quality patterns such as gibberish and repetitive words. Thus, sample-level loss calculation, due to its inability to effectively penalize those undesirable patterns in long samples, leads to an unhealthy increase in entropy and response length, as shown in Figure 4a and Figure 4b.

We introduce a token-level policy gradient loss in the long-CoT RL scenario to address the above limitations:

$$\mathcal{J}_{\text{DAPO}}(\theta) = \mathbb{E}_{(q,a) \sim \mathcal{D},\, \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot \mid q)} \left[ \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \min \left( r_{i,t}(\theta) \hat{A}_{i,t}, \operatorname{clip} \left( r_{i,t}(\theta), 1 - \varepsilon_{\text{low}}, 1 + \varepsilon_{\text{high}} \right) \hat{A}_{i,t} \right) \right], \tag{12}$$

$$\text{s.t.} \quad 0 < \left| \{o_i \mid \text{is\_equivalent}(a, o_i)\} \right| < G.$$

In this setting, longer sequences can have more influence on the overall gradient update than shorter sequences. Moreover, from the perspective of individual tokens, if a particular generation pattern leads to an increase or decrease in reward, it is equally promoted or suppressed, regardless of the length of the response in which it appears.

### 3.4 Hide and Seek: Overlong Reward Shaping

In RL training, we typically set a maximum length for generation, with overlong samples truncated accordingly.
We find that improper reward shaping for truncated samples can introduce reward noise and significantly disrupt the training process.

**Figure 5** The accuracy of the actor model on AIME and the entropy of its generation probabilities, both before and after applying the **Overlong Reward Shaping** strategy.

By default, we assign a punitive reward to truncated samples. This approach may introduce noise into the training process, as a sound reasoning process can be penalized solely because of its excessive length. Such penalties can confuse the model about the validity of its reasoning process.

To investigate the impact of this reward noise, we first apply an **Overlong Filtering** strategy, which masks the loss of truncated samples. We find that this approach significantly stabilizes training and enhances performance, as demonstrated in Figure 5.

**Algorithm 1** DAPO: Decoupled Clip and Dynamic sAmpling Policy Optimization

```
Input: initial policy model \pi_\theta; reward model R; task prompts \mathcal{D}; hyperparameters \varepsilon_{low}, \varepsilon_{high}
 1: for step = 1, ..., M do
 2:     Sample a batch \mathcal{D}_b from \mathcal{D}
 3:     Update the old policy model \pi_{\theta_{old}} <- \pi_\theta
 4:     Sample G outputs \{o_i\}_{i=1}^G ~ \pi_{\theta_{old}}(·|q) for each question q in \mathcal{D}_b
 5:     Compute rewards \{R_i\}_{i=1}^G for each sampled output o_i by running R
 6:     Filter out o_i and add the remaining to the dynamic sampling buffer (Dynamic Sampling, Equation (11))
 7:     if buffer size n_b < N:
 8:         continue
 9:     For each o_i in the buffer, compute \hat{A}_{i,t} for the t-th token of o_i (Equation (9))
10:     for iteration = 1, ..., \mu do
11:         Update the policy model \pi_\theta by maximizing the DAPO objective (Equation (8))
Output: \pi_\theta
```

Furthermore, we propose **Soft Overlong Punishment** (Equation
13), a length-aware penalty mechanism designed to shape the reward for truncated samples. Specifically, when the response length exceeds the predefined maximum value, we define a punishment interval. Within this interval, the longer the response, the greater the punishment it receives. This penalty is added to the original rule-based correctness reward, thereby signaling the model to avoid excessively long responses.

$$R_{\text{length}}(y) = \begin{cases} 0, & |y| \le L_{\text{max}} - L_{\text{cache}} \\ \dfrac{(L_{\text{max}} - L_{\text{cache}}) - |y|}{L_{\text{cache}}}, & L_{\text{max}} - L_{\text{cache}} < |y| \le L_{\text{max}} \\ -1, & L_{\text{max}} < |y| \end{cases} \tag{13}$$

**Figure 6** The training progress before and after applying dynamic sampling on a baseline setting.

### 3.5 Dataset Transformation

Our dataset is sourced from the web and official competition homepages through a combination of web scraping and manual annotation. The answers in math datasets typically come in a variety of formats, such as expressions, formulas, and numbers, which makes it challenging to design comprehensive rules to parse them. To provide accurate rule-based reward signals and minimize errors introduced by formula parsers, inspired by AIME, we select and transform the answers into integers, which are easy to parse. For example, if the original answer is expressed in the form $\frac{a+\sqrt{b}}{c}$, we instruct the LLM to modify the question so that the expected answer becomes a+b+c. After selection and transformation, we obtain the **DAPO-Math-17K** dataset, which consists of 17K prompts, each paired with an integer answer.

## 4 Experiments

### 4.1 Training Details

In this work, we focus specifically on mathematical tasks to evaluate our algorithm, though it can be readily transferred to other tasks. We adopt the verl framework [20] for training.
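The length penalty of Equation 13 reduces to a short function. This is a minimal sketch; the function name is illustrative, and the defaults mirror the $L_{\text{max}} = 20{,}480$ and $L_{\text{cache}} = 4{,}096$ setting given in Section 4.1:

```python
def soft_overlong_punishment(length: int,
                             l_max: int = 20480,
                             l_cache: int = 4096) -> float:
    """Length-aware penalty of Equation 13, added to the correctness reward."""
    if length <= l_max - l_cache:
        return 0.0            # within the expected length: no penalty
    if length <= l_max:
        # linear ramp from 0 down to -1 across the punish interval
        return ((l_max - l_cache) - length) / l_cache
    return -1.0               # beyond the hard limit: full penalty

for n in (16000, 18432, 20480, 25000):
    print(n, soft_overlong_punishment(n))
```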
We use naive GRPO [38] as our baseline algorithm and estimate advantages with group reward normalization.

For hyperparameters, we use the AdamW [39] optimizer with a constant learning rate of $1 \times 10^{-6}$, with a linear warm-up over 20 rollout steps. For rollout, the prompt batch size is 512 and we sample 16 responses for each prompt. For training, the mini-batch size is set to 512, i.e., 16 gradient updates per rollout step. For **Overlong Reward Shaping**, we set the expected maximum length to 16,384 tokens and allocate an additional 4,096 tokens as the soft punish cache, so the maximum number of tokens for generation is 20,480. For the **Clip-Higher** mechanism, we set the clipping parameters $\varepsilon_{\text{low}}$ to 0.2 and $\varepsilon_{\text{high}}$ to 0.28, which effectively balances the trade-off between exploration and exploitation. For evaluation on AIME, we repeat the evaluation set 32 times and report avg@32 for result stability. The inference hyperparameters for evaluation are temperature 1.0 and top-p 0.7.

### 4.2 Main Results

Experiments on AIME 2024 demonstrate that **DAPO** successfully trains the Qwen-32B Base model into a powerful reasoning model, achieving performance superior to DeepSeek's experiments on Qwen2.5-32B with the R1 approach. In Figure 1, we observe a substantial improvement in performance on AIME 2024, with accuracy increasing from near 0% to 50%. Notably, this improvement is achieved with only 50% of the training steps required by DeepSeek-R1-Zero-Qwen-32B.

We analyze the contributions of each training technique in our methodology, as detailed in Table 1.
The observed improvements demonstrate the effectiveness of these techniques in RL training, each contributing

**Table 1** Main results of progressive techniques applied to **DAPO**

| Model                      | AIME24 avg@32 |
|----------------------------|---------------|
| DeepSeek-R1-Zero-Qwen-32B  | 47            |
| Naive GRPO                 | 30            |
| + Overlong Filtering       | 36            |
| + Clip-Higher              | 38            |
| + Soft Overlong Punishment | 41            |
| + Token-level Loss         | 42            |
| + Dynamic Sampling (DAPO)  | 50            |

several accuracy points on AIME 2024. Notably, with the vanilla GRPO setting, only 30% accuracy can be reached by training from the Qwen2.5-32B base model.

For token-level loss, although it brings a smaller performance improvement, we find that it enhances training stability and makes the length increase more healthily.

When applying **Dynamic Sampling**, although more data needs to be sampled due to the filtering of zero-gradient data, the overall training time is not significantly affected. As shown in Figure 6, although the number of sampled instances increases, the model's convergence time is even reduced, because fewer training steps are required.

### 4.3 Training Dynamics

Reinforcement learning on large language models is not only a cutting-edge research direction but also an intrinsically complex systems engineering challenge, characterized by the interdependence of its various subsystems. Modifications to any single subsystem can propagate through the system and lead to unforeseen consequences due to the intricate interplay among these components. Even seemingly minor changes in initial conditions, such as variations in data and hyperparameters, can amplify through iterative reinforcement learning processes and yield substantial deviations in outcomes.
This complexity often confronts researchers with a dilemma: even after meticulous analysis and well-founded expectations that a modification will enhance specific aspects of the training process, the actual results frequently diverge from the anticipated trajectory. Therefore, monitoring key intermediate results during experimentation is essential for swiftly identifying the sources of discrepancies and, ultimately, for refining the system.

- **The Length of Generated Responses** is a metric closely related to training stability and performance, as shown in Figure 7a. The increase in length provides the model with a larger space for exploration, allowing more complex reasoning behaviors to be sampled and gradually reinforced through training. However, length does not always maintain a continuous upward trend during training: over considerable periods it can stagnate or even decline, as also demonstrated in [2]. We typically use length in conjunction with validation accuracy to assess whether an experiment is deteriorating.
- **The Dynamics of Reward** during training has always been one of the crucial monitoring indicators in reinforcement learning, as shown in Figure 7b. In the majority of our experiments, the trend of reward increase is relatively stable and does not fluctuate or decline significantly due to adjustments in experimental settings. This indicates that, given a reliable reward signal, language models can robustly fit the distribution of the training set. However, we find that the final reward on the training set often exhibits little correlation with accuracy on the validation set, which indicates overfitting to the training set.
- **The Entropy of the Actor Model and Generation Probability** are related to the model's exploration capability and are key metrics that we closely monitor in our experiments.
Intuitively, the model's entropy needs to be maintained within an appropriate range. An excessively low entropy indicates that the probability distribution is overly sharp, leading to a loss of exploration capability, while an excessively high entropy is often associated with over-exploration issues such as gibberish and repetitive generation. For the generation probability, the situation is exactly the opposite. As demonstrated in Section 3.1, by applying the Clip-Higher strategy, we effectively addressed the issue of entropy collapse. In subsequent experiments, we find that maintaining a slow upward trend in entropy is conducive to improving model performance, as shown in Figure 7c and Figure 7d.

**Figure 7** The metric curves of response length, reward score, generation entropy, and mean probability for **DAPO**, which show the dynamics of RL training and serve as essential monitoring indicators for identifying potential issues.

### 4.4 Case Study

During the RL training process, we observe an interesting phenomenon: the reasoning patterns of the actor model evolve dynamically over time. Specifically, the algorithm not only reinforces existing reasoning patterns that facilitate correct problem-solving but also gradually gives rise to entirely new modes of reasoning that were initially absent. This finding reveals the adaptability and exploration capability of RL algorithms and offers new insights into the learning mechanisms of the model.

For example, in the early stages of training, there was virtually no checking or reflecting on previous reasoning steps. As training progresses, however, the model exhibits distinct behaviors of reflection and backtracking, as shown in Table 2. This observation sheds light on further exploration into interpreting the emergence of reasoning abilities during RL, which we leave for future research.
## 5 Conclusion

In this paper, we release a fully open-sourced system for large-scale LLM RL, including the algorithm, code infrastructure, and dataset. The system achieves state-of-the-art large-scale LLM RL performance (AIME 50 with the Qwen-32B pretrained model). We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (**DAPO**) algorithm and introduce 4 key techniques to make RL powerful, effective, and efficient in the long-CoT RL scenario. Additionally, by open-sourcing the training code and dataset, we provide the broader research community and society with practical access to a scalable reinforcement learning solution, enabling all to benefit from these advancements.

**Table 2** Emergence of Reflective Behavior in Reinforcement Learning

### Question:

Given that the base ABC of the tetrahedron S-ABC is an equilateral triangle, the projection H of point A on the face SBC is the orthocenter of $\triangle SBC$, the dihedral angle H-AB-C is $30^{\circ}$, and SA = 2, find the volume of the tetrahedron. The answer is in the form $\frac{k}{m}$, so give the value of k+m.

### Response:

Let us consider the given problem: We need to determine ...

...

Next, let S have coordinates $(x_0, y_0, z_0)$. Since SA = 2, the distance formula gives us $\sqrt{x_0^2 + y_0^2 + z_0^2} = 2$, so we can write $x_0^2 + y_0^2 + z_0^2 = 4$. Now, remember that H is the orthogonal projection of

...

However, wait a moment, let's rethink about the dihedral angle involving planes in a more thoughtful geometric way. Consider the plane $\alpha_1 = ABC$, which lies entirely on the xy coordinate plane (as z = 0). Let the plane $\alpha_2 = SBC$. The point A projected perpendicularly to plane $\alpha_2$ lands on H. The line l = AB ...

...
## **Contributions**

### **Project Lead**

Qiying Yu$^{1,2,4}$

### **Algorithm**

Qiying Yu$^{1,2,4}$, Zheng Zhang$^1$, Ruofei Zhu$^1$, Yufeng Yuan$^1$, Xiaochen Zuo$^1$, Yu Yue$^1$

### **Infrastructure**\*

Weinan Dai$^{1,2,4}$, Tiantian Fan$^1$, Gaohong Liu$^1$, Juncai Liu$^1$, Lingjun Liu$^1$, Xin Liu$^1$, Haibin Lin$^1$, Zhiqi Lin$^1$, Bole Ma$^1$, Guangming Sheng$^{1,3}$, Yuxuan Tong$^{1,2,4}$, Qiying Yu$^{1,2,4}$, Chi Zhang$^1$, Mofan Zhang$^1$, Ru Zhang$^1$, Wang Zhang$^1$, Hang Zhu$^1$, Jinhua Zhu$^1$

\*Ordered alphabetically by last name

### **Dataset**

Jiaze Chen$^1$, Jiangjie Chen$^{1,4}$, Chengyi Wang$^1$, Hongli Yu$^{1,2,4}$, Yuxuan Song$^{1,2,4}$, Xiangpeng Wei$^1$, Qiying Yu$^{1,2,4}$

### **Supervision**

Hao Zhou$^{2,4}$, Jingjing Liu$^{2,4}$, Wei-Ying Ma$^{2,4}$, Ya-Qin Zhang$^{2,4}$, Lin Yan$^{1,4}$, Mu Qiao$^{1,4}$, Yonghui Wu$^1$, Mingxuan Wang$^{1,4}$

### **Affiliation**

$^1$ByteDance Seed

$^2$Institute for AI Industry Research (AIR), Tsinghua University

$^3$The University of Hong Kong

$^4$SIA-Lab of Tsinghua AIR and ByteDance Seed

## **Acknowledgments**

We thank Zhengyin Du, Shengding Hu, Kai Shen, Tianyang Zhan, Zhen Xiao, Renjie Zheng, Li Han, and Kaihua Jiang, as well as other colleagues at ByteDance, for their support of the **DAPO** project.

## References

- [1] OpenAI. Learning to reason with LLMs, 2024.
- [2] Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, et al. DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning. arXiv preprint arXiv:2501.12948, 2025.
- [3] OpenAI. GPT-4 technical report. arXiv preprint arXiv:2303.08774, 2023.
- [4] Anthropic. Claude 3.5 Sonnet, 2024.
- [5] Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in Neural Information Processing Systems, 33:1877–1901, 2020.
- [6] Aakanksha Chowdhery, Sharan Narang, Jacob Devlin, Maarten Bosma, Gaurav Mishra, Adam Roberts, Paul Barham, Hyung Won Chung, Charles Sutton, Sebastian Gehrmann, et al. PaLM: Scaling language modeling with pathways. Journal of Machine Learning Research, 24(240):1–113, 2023.
- [7] Aixin Liu, Bei Feng, Bing Xue, Bingxuan Wang, Bochao Wu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, et al. DeepSeek-V3 technical report. arXiv preprint arXiv:2412.19437, 2024.
- [8] xAI. Grok 3 Beta: the age of reasoning agents, 2024.
- [9] Google DeepMind. Gemini 2.0 Flash Thinking, 2024.
- [10] Qwen. QwQ-32B: Embracing the power of reinforcement learning, 2024.
- [11] Kimi Team, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al. Kimi k1.5: Scaling reinforcement learning with LLMs. arXiv preprint arXiv:2501.12599, 2025.
- [12] An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, et al. Qwen2.5 technical report. arXiv preprint arXiv:2412.15115, 2024.
- [13] Zhipeng Chen, Yingqian Min, Beichen Zhang, Jie Chen, Jinhao Jiang, Daixuan Cheng, Wayne Xin Zhao, Zheng Liu, Xu Miao, Yang Lu, et al. An empirical study on eliciting and improving R1-like reasoning models. arXiv preprint arXiv:2503.04548, 2025.
- [14] Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, Xiangyu Zhang, and Heung-Yeung Shum. Open-Reasoner-Zero: An open source approach to scaling reinforcement learning on the base model. https://github.com/Open-Reasoner-Zero/Open-Reasoner-Zero, 2025.
- [15] Jian Hu. REINFORCE++: A simple and efficient approach for aligning large language models. arXiv preprint arXiv:2501.03262, 2025.
- [16] Ganqu Cui, Lifan Yuan, Zefan Wang, Hanbin Wang, Wendi Li, Bingxiang He, Yuchen Fan, Tianyu Yu, Qixin Xu, Weize Chen, et al. Process reinforcement through implicit rewards. arXiv preprint arXiv:2502.01456, 2025.
- [17] Jung Hyun Lee, June Yong Yang, Byeongho Heo, Dongyoon Han, and Kang Min Yoo. Token-supervised value models for enhancing mathematical reasoning capabilities of large language models. arXiv preprint arXiv:2407.12863, 2024.
- [18] Amirhossein Kazemnejad, Milad Aghajohari, Eva Portelance, Alessandro Sordoni, Siva Reddy, Aaron Courville, and Nicolas Le Roux. VinePPO: Unlocking RL potential for LLM reasoning through refined credit assignment. arXiv preprint arXiv:2410.01679, 2024.
- [19] Yufeng Yuan, Yu Yue, Ruofei Zhu, Tiantian Fan, and Lin Yan. What's behind PPO's collapse in long-CoT? Value optimization holds the secret. arXiv preprint arXiv:2503.01491, 2025.
- [20] Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu. HybridFlow: A flexible and efficient RLHF framework. arXiv preprint arXiv:2409.19256, 2024.
- [21] John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347, 2017.
- [22] John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. High-dimensional continuous control using generalized advantage estimation, 2018.
- [23] Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul F Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. In S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh, editors, Advances in Neural Information Processing Systems, volume 35, pages 27730–27744. Curran Associates, Inc., 2022.
- [24] Dario Amodei, Chris Olah, Jacob Steinhardt, Paul Christiano, John Schulman, and Dan Mané. Concrete problems in AI safety, 2016.
- [25] Tom Everitt, Victoria Krakovna, Laurent Orseau, Marcus Hutter, and Shane Legg.
Reinforcement learning with a corrupted reward channel, 2017.
- [26] Victoria Krakovna, Jonathan Uesato, Vladimir Mikulik, Matthew Rahtz, Tom Everitt, Ramana Kumar, Zac Kenton, Jan Leike, and Shane Legg. Specification gaming: the flip side of AI ingenuity, 2020.
- [27] Tom Everitt, Marcus Hutter, Ramana Kumar, and Victoria Krakovna. Reward tampering problems and solutions in reinforcement learning: A causal influence diagram perspective, 2021.
- [28] Leo Gao, John Schulman, and Jacob Hilton. Scaling laws for reward model overoptimization, 2022.
- [29] Lilian Weng. Reward hacking in reinforcement learning. lilianweng.github.io, Nov 2024.
- [30] Stanislas Polu and Ilya Sutskever. Generative language modeling for automated theorem proving, 2020.
- [31] Trieu H. Trinh, Yuhuai Wu, Quoc V. Le, He He, and Thang Luong. Solving olympiad geometry without human demonstrations. Nature, 625(7995):476–482, 2024.
- [32] Trieu Trinh and Thang Luong. AlphaGeometry: An Olympiad-level AI system for geometry, 2024.
- [33] AlphaProof and AlphaGeometry Teams. AI achieves silver-medal standard solving International Mathematical Olympiad problems, 2024.
- [34] Hung Le, Yue Wang, Akhilesh Deepak Gotmare, Silvio Savarese, and Steven Chu Hong Hoi. CodeRL: Mastering code generation through pretrained models and deep reinforcement learning. Advances in Neural Information Processing Systems, 35:21314–21328, 2022.
- [35] Noah Shinn, Federico Cassano, Edward Berman, Ashwin Gopinath, Karthik Narasimhan, and Shunyu Yao. Reflexion: Language agents with verbal reinforcement learning, 2023.
- [36] Xinyun Chen, Maxwell Lin, Nathanael Schärli, and Denny Zhou. Teaching large language models to self-debug, 2023.
- [37] Jonas Gehring, Kunhao Zheng, Jade Copet, Vegard Mella, Quentin Carbonneaux, Taco Cohen, and Gabriel Synnaeve. RLEF: Grounding code LLMs in execution feedback with reinforcement learning, 2025.
- [38] Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo. DeepSeekMath: Pushing the limits of mathematical reasoning in open language models. arXiv preprint arXiv:2402.03300, 2024.
- [39] Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization. In International Conference on Learning Representations, 2019.

## **Appendix**

### **A Dataset Transformation**

Here is an example of data transformation, demonstrating the original problem, the transformed problem, and the model's outputs during the transformation process (the original is markdown text, rendered here for readability).

To ensure the model can complete this task with high accuracy, our goal is to have it perform comprehensive reasoning while avoiding hallucinations. We achieve this by providing a clear thinking framework that encourages thorough reasoning. Specifically, we employ Chain-of-Thought (CoT) reasoning along with four well-defined steps: extract the answer format, rewrite the problem statement, solve the modified problem, and provide an integer as the final answer. For each step, we provide few-shot solutions or detailed guidelines to guide the model's reasoning.

Through this approach, we have observed that in most cases the LLM can generate reformulations whose format and quality are both satisfactory.

### **Original Problem**

Let x and y be real numbers such that $x^2 + y^2 - 22x - 16y + 113 = 0$. Determine the smallest possible value of x. **Answer:** $11 - 2\sqrt{6}$

#### **Transformed Problem**

Let x and y be real numbers such that $x^2 + y^2 - 22x - 16y + 113 = 0$. Determine the smallest possible value of x. The original answer is in the form $k - m\sqrt{n}$, where k, m, and n are integers. Please find the value of k + m + n.
**Answer:** 19 + +#### **Model's Output (Markdown Text)** + +#### **Transformation Thought Process** + +The original answer to the problem is an algebraic expression, $11 - 2\sqrt{6}$. We need to transform this into an integer form. First, note that the answer consists of an integer part and a square root term. According to the given rule, we replace the integer and square root components with undetermined coefficients. The original problem uses x and y as variables, so we choose other letters as undetermined coefficients. We express the answer in the form $k - m\sqrt{n}$ and require the student to find k + m + n. + +#### **Answer Format** + +$k - m\sqrt{n}$ + +#### **Transformed Problem** + +Let x and y be real numbers such that $x^2 + y^2 - 22x - 16y + 113 = 0$. Determine the smallest possible value of x. The original answer is in the form $k - m\sqrt{n}$, where k, m, and n are integers. Please find the value of k + m + n. + +#### **Answer Calculation for the Transformed Problem** + +- The original answer is: $11 - 2\sqrt{6}$. +- The required format is: $k - m\sqrt{n}$. +- Matching the coefficients: k = 11, m = 2, n = 6. +- The final result is: k + m + n = 11 + 2 + 6 = 19. + +#### **Final Answer** + +19 + +### **B Supplementary Case** + +#### Question: + +Among the 900 residents of Aimeville, there are 195 who own a diamond ring, 367 who own a set of golf clubs, and 562 who own a garden spade. In addition, each of the 900 residents owns a bag of candy hearts. There are 437 residents who own exactly two of these things, and 234 residents who own exactly three of these things. Find the number of residents of Aimeville who own all four of these things. + +#### Response: + +Step-by-Step Solution: + +1. Let's define the following variables: + +Let A be the set of residents who own a diamond ring. + +Let B be the set of residents who own a set of golf clubs. + +Let C be the set of residents who own a garden spade.
+ +Let D be the set of residents who own a bag of candy hearts. Since all 900 residents own a bag of candy hearts, |D| = 900. + +Let $a_i$ denote the number of residents who own exactly i out of the 4 items. + +Let's use the principle of inclusion-exclusion for the union of the 4 sets A, B, C, D. + +$$a_4 = \frac{-219}{-4} = \frac{219}{4} = 54.75.$$ + +Since $a_4$, the number of people owning all four items, must be a whole number, our current approach needs to be reconsidered in terms of the combinatorial counting of people owning different combinations of the items. + +Instead of directly using the inclusion-exclusion principle, we can use a different approach involving the number of people owning combinations of the items A, B, C, and D. + +Let $x_i$ be the number of people owning exactly i items among A, B, C (excluding the item D, which everyone owns). + +Then the total number of people owning exactly k items among A, B, C, and D can be broken down: + +. . . + +**Table 3** A case of reflective behavior emergence.
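The counting approach that the response pivots to can be checked directly. Below is a short numerical sketch (ours, not part of the model's output) that carries the $x_i$ substitution through to the final count:

```python
# Every resident owns the candy hearts (D), so owning exactly i of the
# four items is the same as owning exactly i-1 of {ring, clubs, spade}.
ring, clubs, spade = 195, 367, 562
x1 = 437  # exactly two of the four items  -> exactly one of {A, B, C}
x2 = 234  # exactly three of the four     -> exactly two of {A, B, C}

# Count item ownerships with multiplicity:
# |A| + |B| + |C| = x1 + 2*x2 + 3*x3, where x3 residents own all of A, B, C
# (and hence all four items, since everyone owns D).
x3 = (ring + clubs + spade - x1 - 2 * x2) // 3
print(x3)  # prints 73
```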
\ No newline at end of file diff --git a/research/papers/arXiv-dapo/arXiv-2503.14476_meta.json b/research/papers/arXiv-dapo/arXiv-2503.14476_meta.json new file mode 100644 index 0000000..1153088 --- /dev/null +++ b/research/papers/arXiv-dapo/arXiv-2503.14476_meta.json @@ -0,0 +1,1747 @@ +{ + "table_of_contents": [ + { + "title": "DAPO: An Open-Source LLM Reinforcement Learning\nSystem at Scale", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 82.5, + 102.8671875 + ], + [ + 527.25, + 102.8671875 + ], + [ + 527.25, + 139.9921875 + ], + [ + 82.5, + 139.9921875 + ] + ] + }, + { + "title": "Abstract", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 277.013671875, + 262.96875 + ], + [ + 333.0, + 262.96875 + ], + [ + 333.0, + 274.5 + ], + [ + 277.013671875, + 274.5 + ] + ] + }, + { + "title": "1 Introduction", + "heading_level": null, + "page_id": 1, + "polygon": [ + [ + 69.328125, + 100.5 + ], + [ + 159.75, + 100.5 + ], + [ + 159.75, + 111.0 + ], + [ + 69.328125, + 111.0 + ] + ] + }, + { + "title": "2 Preliminary", + "heading_level": null, + "page_id": 1, + "polygon": [ + [ + 69.92578125, + 506.6015625 + ], + [ + 157.5, + 506.6015625 + ], + [ + 157.5, + 518.9765625 + ], + [ + 69.92578125, + 518.9765625 + ] + ] + }, + { + "title": "2.1 Proximal Policy Optimization (PPO)", + "heading_level": null, + "page_id": 1, + "polygon": [ + [ + 69.626953125, + 529.5 + ], + [ + 285.0, + 529.5 + ], + [ + 285.0, + 540.0 + ], + [ + 69.626953125, + 540.0 + ] + ] + }, + { + "title": "2.2 Group Relative Policy Optimization (GRPO)", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 69.0, + 322.13671875 + ], + [ + 327.75, + 322.13671875 + ], + [ + 327.75, + 333.75 + ], + [ + 69.0, + 333.75 + ] + ] + }, + { + "title": "2.3 Removing KL Divergence", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 69.626953125, + 599.25 + ], + [ + 231.0, + 599.25 + ], + [ + 231.0, + 609.75 + ], + [ + 69.626953125, + 609.75 + ] + ] + }, + { + "title": "2.4 
Rule-based Reward Modeling", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 69.92578125, + 100.546875 + ], + [ + 258.75, + 100.546875 + ], + [ + 258.75, + 111.375 + ], + [ + 69.92578125, + 111.375 + ] + ] + }, + { + "title": "3 DAPO", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 69.92578125, + 241.3125 + ], + [ + 123.75, + 241.3125 + ], + [ + 124.5, + 252.0 + ], + [ + 69.92578125, + 252.75 + ] + ] + }, + { + "title": "3.1 Raise the Ceiling: Clip-Higher", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 70.822265625, + 466.5 + ], + [ + 254.25, + 466.5 + ], + [ + 254.25, + 477.984375 + ], + [ + 70.822265625, + 477.984375 + ] + ] + }, + { + "title": "3.2 The More the Merrier: Dynamic Sampling", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 69.75, + 486.75 + ], + [ + 318.75, + 486.75 + ], + [ + 318.75, + 496.546875 + ], + [ + 69.75, + 496.546875 + ] + ] + }, + { + "title": "3.3 Rebalancing Act: Token-Level Policy Gradient Loss", + "heading_level": null, + "page_id": 5, + "polygon": [ + [ + 69.75, + 371.25 + ], + [ + 374.25, + 371.25 + ], + [ + 374.25, + 382.078125 + ], + [ + 69.75, + 382.078125 + ] + ] + }, + { + "title": "3.4 Hide and Seek: Overlong Reward Shaping", + "heading_level": null, + "page_id": 5, + "polygon": [ + [ + 69.92578125, + 692.25 + ], + [ + 325.5, + 692.25 + ], + [ + 325.5, + 702.75 + ], + [ + 69.92578125, + 702.75 + ] + ] + }, + { + "title": "3.5 Dataset Transformation", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ + 69.92578125, + 306.66796875 + ], + [ + 228.75, + 306.66796875 + ], + [ + 228.75, + 315.75 + ], + [ + 69.92578125, + 315.75 + ] + ] + }, + { + "title": "4 Experiments", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ + 70.224609375, + 435.75 + ], + [ + 164.25, + 435.75 + ], + [ + 164.25, + 447.0 + ], + [ + 70.224609375, + 447.0 + ] + ] + }, + { + "title": "4.1 Training Details", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ + 69.328125, 
+ 458.6484375 + ], + [ + 181.5, + 458.6484375 + ], + [ + 181.5, + 468.0 + ], + [ + 69.328125, + 468.0 + ] + ] + }, + { + "title": "4.2 Main Results", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ + 69.0, + 637.69921875 + ], + [ + 165.75, + 637.69921875 + ], + [ + 165.75, + 648.0 + ], + [ + 69.0, + 648.0 + ] + ] + }, + { + "title": "4.3 Training Dynamics", + "heading_level": null, + "page_id": 8, + "polygon": [ + [ + 69.0, + 372.796875 + ], + [ + 198.75, + 372.796875 + ], + [ + 198.75, + 384.0 + ], + [ + 69.0, + 384.0 + ] + ] + }, + { + "title": "4.4 Case Study", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 69.328125, + 541.40625 + ], + [ + 161.25, + 541.40625 + ], + [ + 161.25, + 551.25 + ], + [ + 69.328125, + 551.25 + ] + ] + }, + { + "title": "5 Conclusion", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 69.75, + 687.5859375 + ], + [ + 153.75, + 687.5859375 + ], + [ + 153.75, + 698.25 + ], + [ + 69.75, + 698.25 + ] + ] + }, + { + "title": "Question:", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 93.0, + 104.25 + ], + [ + 136.5, + 104.25 + ], + [ + 136.5, + 113.25 + ], + [ + 93.0, + 113.25 + ] + ] + }, + { + "title": "Response:", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 93.0, + 168.99609375 + ], + [ + 138.75, + 168.99609375 + ], + [ + 138.75, + 177.75 + ], + [ + 93.0, + 177.75 + ] + ] + }, + { + "title": "Contributions", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.75, + 99.75 + ], + [ + 151.5, + 99.75 + ], + [ + 151.5, + 110.25 + ], + [ + 69.75, + 110.25 + ] + ] + }, + { + "title": "Project Lead", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.0, + 123.0 + ], + [ + 127.5, + 123.0 + ], + [ + 127.5, + 133.03125 + ], + [ + 69.0, + 133.03125 + ] + ] + }, + { + "title": "Algorithm", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.0, + 159.0 + ], + [ + 114.75, + 159.0 + ], + [ + 114.75, + 169.5 + ], + [ + 69.0, + 169.5 + ] + ] 
+ }, + { + "title": "Infrastructure*", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.0, + 195.0 + ], + [ + 135.75, + 195.0 + ], + [ + 135.75, + 204.0 + ], + [ + 69.0, + 204.0 + ] + ] + }, + { + "title": "Dataset", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.0, + 272.63671875 + ], + [ + 108.0, + 272.63671875 + ], + [ + 108.0, + 281.25 + ], + [ + 69.0, + 281.25 + ] + ] + }, + { + "title": "Supervision", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.75, + 321.0 + ], + [ + 122.25, + 321.0 + ], + [ + 122.25, + 330.0 + ], + [ + 69.75, + 330.0 + ] + ] + }, + { + "title": "Affiliation", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.0, + 368.25 + ], + [ + 114.75, + 368.25 + ], + [ + 114.75, + 378.0 + ], + [ + 69.0, + 378.0 + ] + ] + }, + { + "title": "Acknowledgments", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.75, + 466.3828125 + ], + [ + 181.5, + 466.3828125 + ], + [ + 181.5, + 477.0 + ], + [ + 69.75, + 477.0 + ] + ] + }, + { + "title": "References", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 69.75, + 100.5 + ], + [ + 137.25, + 100.5 + ], + [ + 137.25, + 110.25 + ], + [ + 69.75, + 110.25 + ] + ] + }, + { + "title": "Appendix", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 68.25, + 99.0 + ], + [ + 152.25, + 99.0 + ], + [ + 152.25, + 116.015625 + ], + [ + 68.25, + 116.015625 + ] + ] + }, + { + "title": "A Dataset Transformation", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 69.92578125, + 130.7109375 + ], + [ + 236.25, + 130.7109375 + ], + [ + 236.25, + 141.0 + ], + [ + 69.92578125, + 141.0 + ] + ] + }, + { + "title": "Original Problem", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 84.568359375, + 313.2421875 + ], + [ + 179.25, + 313.2421875 + ], + [ + 179.25, + 324.0703125 + ], + [ + 84.568359375, + 324.0703125 + ] + ] + }, + { + "title": "Transformed Problem", + "heading_level": null, + "page_id": 
14, + "polygon": [ + [ + 84.75, + 367.3828125 + ], + [ + 207.0, + 367.3828125 + ], + [ + 207.0, + 377.25 + ], + [ + 84.75, + 377.25 + ] + ] + }, + { + "title": "Model's output (Markdown Text)", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 84.75, + 435.75 + ], + [ + 265.5, + 435.75 + ], + [ + 265.5, + 445.5 + ], + [ + 84.75, + 445.5 + ] + ] + }, + { + "title": "Transformation Thought Process", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 84.75, + 452.25 + ], + [ + 250.5, + 452.25 + ], + [ + 250.5, + 462.0 + ], + [ + 84.75, + 462.0 + ] + ] + }, + { + "title": "Answer Format", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 85.5, + 527.25 + ], + [ + 163.5, + 527.25 + ], + [ + 163.5, + 536.765625 + ], + [ + 85.5, + 536.765625 + ] + ] + }, + { + "title": "Transformed Problem", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 84.75, + 551.84765625 + ], + [ + 195.75, + 551.84765625 + ], + [ + 195.75, + 561.75 + ], + [ + 84.75, + 561.75 + ] + ] + }, + { + "title": "Answer Calculation for the Transformed Problem", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 84.75, + 601.5 + ], + [ + 330.75, + 601.5 + ], + [ + 330.75, + 610.62890625 + ], + [ + 84.75, + 610.62890625 + ] + ] + }, + { + "title": "Final Answer", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 84.75, + 667.4765625 + ], + [ + 150.75, + 667.4765625 + ], + [ + 150.75, + 677.25 + ], + [ + 84.75, + 677.25 + ] + ] + }, + { + "title": "B Supplementary Case", + "heading_level": null, + "page_id": 15, + "polygon": [ + [ + 69.92578125, + 100.16015625 + ], + [ + 216.75, + 100.16015625 + ], + [ + 216.75, + 111.76171875 + ], + [ + 69.92578125, + 111.76171875 + ] + ] + }, + { + "title": "Question:", + "heading_level": null, + "page_id": 15, + "polygon": [ + [ + 92.935546875, + 132.75 + ], + [ + 135.75, + 132.75 + ], + [ + 135.75, + 141.0 + ], + [ + 92.935546875, + 141.0 + ] + ] + }, + { + "title": "Response:", + "heading_level": 
null, + "page_id": 15, + "polygon": [ + [ + 93.0, + 209.21484375 + ], + [ + 139.5, + 209.21484375 + ], + [ + 139.5, + 219.0 + ], + [ + 93.0, + 219.0 + ] + ] + } + ], + "page_stats": [ + { + "page_id": 0, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 55 + ], + [ + "Span", + 25 + ], + [ + "Text", + 7 + ], + [ + "PageHeader", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Picture", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 10360, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 1, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 57 + ], + [ + "Span", + 45 + ], + [ + "Text", + 8 + ], + [ + "ListItem", + 4 + ], + [ + "SectionHeader", + 3 + ], + [ + "Equation", + 3 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + "llm_tokens_used": 2902, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 2, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 69 + ], + [ + "Span", + 24 + ], + [ + "Text", + 5 + ], + [ + "Equation", + 3 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + "llm_tokens_used": 3870, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 3, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 52 + ], + [ + "Span", + 42 + ], + [ + "Text", + 8 + ], + [ + "SectionHeader", + 3 + ], + [ + "Equation", + 3 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + 
"llm_tokens_used": 3501, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 4, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 56 + ], + [ + "Span", + 24 + ], + [ + "Text", + 4 + ], + [ + "Figure", + 2 + ], + [ + "ListItem", + 2 + ], + [ + "Equation", + 2 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 3038, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 5, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 67 + ], + [ + "Span", + 12 + ], + [ + "Text", + 6 + ], + [ + "Figure", + 2 + ], + [ + "ListItem", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Caption", + 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1476, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 6, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 75 + ], + [ + "Span", + 75 + ], + [ + "Text", + 5 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Code", + 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 926, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 7, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 57 + ], + [ + "Span", + 29 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 4 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, 
+ "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 8, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 49 + ], + [ + "TableCell", + 32 + ], + [ + "Span", + 14 + ], + [ + "Text", + 4 + ], + [ + "ListItem", + 3 + ], + [ + "Caption", + 1 + ], + [ + "Table", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1349, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 9, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 75 + ], + [ + "Span", + 19 + ], + [ + "Text", + 4 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 10, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 55 + ], + [ + "Line", + 31 + ], + [ + "Text", + 8 + ], + [ + "SectionHeader", + 2 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 11, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 44 + ], + [ + "Span", + 37 + ], + [ + "SectionHeader", + 8 + ], + [ + "Text", + 7 + ], + [ + "Footnote", + 4 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 12, + "text_extraction_method": 
"surya", + "block_counts": [ + [ + "Line", + 72 + ], + [ + "Span", + 27 + ], + [ + "ListItem", + 21 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 13, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 60 + ], + [ + "Span", + 23 + ], + [ + "ListItem", + 18 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 14, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 89 + ], + [ + "Line", + 64 + ], + [ + "SectionHeader", + 10 + ], + [ + "Text", + 9 + ], + [ + "ListItem", + 4 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 15, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 65 + ], + [ + "Line", + 47 + ], + [ + "Text", + 14 + ], + [ + "SectionHeader", + 3 + ], + [ + "Equation", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 702, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + } + ], + "debug_data_path": "debug_data/arXiv:2503.14476" +} \ No newline at end of file diff --git a/research/papers/arXiv-rl_zvp/_page_0_Figure_6.jpeg b/research/papers/arXiv-rl_zvp/_page_0_Figure_6.jpeg new file mode 100644 index 0000000..6591c74 Binary files /dev/null and b/research/papers/arXiv-rl_zvp/_page_0_Figure_6.jpeg differ diff --git 
a/research/papers/arXiv-rl_zvp/_page_15_Figure_2.jpeg b/research/papers/arXiv-rl_zvp/_page_15_Figure_2.jpeg new file mode 100644 index 0000000..cfdb1f2 Binary files /dev/null and b/research/papers/arXiv-rl_zvp/_page_15_Figure_2.jpeg differ diff --git a/research/papers/arXiv-rl_zvp/_page_1_Figure_2.jpeg b/research/papers/arXiv-rl_zvp/_page_1_Figure_2.jpeg new file mode 100644 index 0000000..86a665d Binary files /dev/null and b/research/papers/arXiv-rl_zvp/_page_1_Figure_2.jpeg differ diff --git a/research/papers/arXiv-rl_zvp/_page_2_Figure_10.jpeg b/research/papers/arXiv-rl_zvp/_page_2_Figure_10.jpeg new file mode 100644 index 0000000..04d605f Binary files /dev/null and b/research/papers/arXiv-rl_zvp/_page_2_Figure_10.jpeg differ diff --git a/research/papers/arXiv-rl_zvp/_page_6_Figure_2.jpeg b/research/papers/arXiv-rl_zvp/_page_6_Figure_2.jpeg new file mode 100644 index 0000000..5f2f60c Binary files /dev/null and b/research/papers/arXiv-rl_zvp/_page_6_Figure_2.jpeg differ diff --git a/research/papers/arXiv-rl_zvp/arXiv-2509.21880.md b/research/papers/arXiv-rl_zvp/arXiv-2509.21880.md new file mode 100644 index 0000000..b09d12d --- /dev/null +++ b/research/papers/arXiv-rl_zvp/arXiv-2509.21880.md @@ -0,0 +1,800 @@ +NO PROMPT LEFT BEHIND: EXPLOITING ZERO-VARIANCE PROMPTS IN LLM REINFORCEMENT LEARNING VIA ENTROPY-GUIDED ADVANTAGE SHAPING + +Thanh-Long V. Le1, Myeongho Jeon2, Kim Vu1, Viet Lai3, Eunho Yang1 + +1KAIST, 2EPFL, 3Adobe Research + +#### **ABSTRACT** + +Reinforcement Learning with Verifiable Rewards (RLVR) is a powerful framework for improving the reasoning abilities of Large Language Models (LLMs). However, current methods such as GRPO rely only on problems where the model's responses to the same input differ in correctness, while ignoring those where all responses receive the same reward—so-called *zero-variance prompts*. In this work, we argue that such prompts are not useless but can, in fact, provide meaningful feedback for policy optimization.
To this end, we introduce *RL with Zero-Variance Prompts* (RL-ZVP), a novel algorithm that extracts learning signals from zero-variance prompts. RL-ZVP directly rewards correctness and penalizes errors even without contrasting responses, modulating feedback with token-level characteristics to preserve informative, nuanced signals. Across six math reasoning benchmarks, RL-ZVP achieves significant improvements of up to 8.61 points in accuracy and 7.77 points in pass rate over GRPO, while consistently outperforming other baselines that filter out zero-variance prompts. These results highlight the untapped potential of learning from zero-variance prompts in RLVR. + +![](_page_0_Figure_6.jpeg) + +Figure 1: **Left**: RL-ZVP uses an entropy-guided advantage formulation to extract learning signals from zero-variance prompts, while reverting to GRPO on other prompts. **Right**: RL-ZVP demonstrates significantly higher average accuracy than GRPO across six math reasoning benchmarks. + +### 1 Introduction + +Recent frontier models such as OpenAI o1 (OpenAI et al., 2024), Claude 3.7 Sonnet (Anthropic, 2025), DeepSeek-R1 (DeepSeek-AI et al., 2025), and Kimi k1.5 (Team et al., 2025) provide compelling proof of concept for leveraging long chain of thought (Wei et al., 2023; Kojima et al., 2023) to enhance the reasoning capabilities of Large Language Models (LLMs). + +Following these advances, Reinforcement Learning with Verifiable Rewards (RLVR) techniques such as Group Relative Policy Optimization (GRPO) (Shao et al., 2024) and Dynamic Sampling Policy Optimization (DAPO) (Yu et al., 2025) have emerged as the de facto standard for strengthening LLM reasoning. In RLVR, the policy model is trained by sampling multiple responses (rollouts) to a prompt and then reinforcing correct answers while penalizing incorrect ones.
While this approach is effective when the responses to a given prompt contain both right and wrong answers, it does not account for scenarios in which a group of rollouts is entirely correct or entirely incorrect (with identical rewards), referred to as *zero-variance prompts* (Zheng et al., 2025). In such situations, the normalized rewards (*i.e.*, advantage values) collapse to zero, resulting in no training signal and rendering all rollouts for those prompts uninformative. While it may seem conceptually reasonable to ignore these prompts, doing so remains problematic because rollout generation itself is a major bottleneck in RL training (Zhong et al., 2025; Noukhovitch et al., 2025; Sheng et al., 2024). Our preliminary experiments show that response sampling typically takes up about 50% of the total time per training step, which emphasizes the significant throughput overhead introduced by rollouts (Figure 2). + +![](_page_1_Figure_2.jpeg) + +Figure 2: **Rollout generation overhead** as a percentage of the total time of a training step. + +Recent methods acknowledge the inefficiency of zero-variance prompts and attempt to filter them out, either pre-rollout (Zheng et al., 2025; Qu et al., 2025) or post-rollout (Yu et al., 2025). However, to the best of our knowledge, no prior work has investigated whether zero-variance prompts themselves can serve as a valuable source of learning signals for policy optimization. + +In this work, we argue that zero-variance prompts could, in fact, provide valuable feedback for policy learning—for example, encouraging the model to exploit all-correct paths while discouraging all-wrong paths and promoting exploration of alternatives. Furthermore, inspired by recent advances in reward shaping (Deng et al., 2025; Lei et al., 2025; Devidze et al., 2022; Ng et al., 1999), we propose that token-level information should be explicitly considered in how such feedback is utilized.
+ +Building on these insights, we introduce **RL-ZVP** (*Reinforcement Learning with Zero-Variance Prompts*), a novel algorithm that extracts useful learning signals from zero-variance prompts to improve the effectiveness of RL training. Our approach rests on two key intuitions: (i) the policy model should still be rewarded for correct answers even if no incorrect answers are present in the same group (and penalized analogously), and (ii) the degree of reward or penalty should be determined by the characteristics of the sampled tokens. Concretely, we formulate the advantage for zero-variance prompts around two properties: *direction*, which aligns with the correctness of the response, and *magnitude*, which is quantified by the entropy value of each token within the response (Figure 1). + +Experiments on six math reasoning benchmarks across two model scales—Qwen3-1.7B-Base and Qwen3-8B-Base (Yang et al., 2025b)—show that RL-ZVP significantly outperforms GRPO, with an average improvement of **4.00 points** in accuracy (Acc@8) and **4.28 points** in pass rate (Pass@8). Notably, RL-ZVP achieves gains of up to **8.66 points** in Acc@8 (on AIME25) and **7.77 points** in Pass@8 (on OlympiadBench). RL-ZVP also consistently surpasses other baselines that filter out zero-variance prompts, even under unfavorable settings where these baselines are given $3\times$–$5\times$ more rollouts. Moreover, across both scales, RL-ZVP demonstrates a more stable and consistent learning process. These results establish zero-variance prompts as a valuable resource for learning signals and open a new perspective on how to utilize them, rather than discarding them, in RLVR. + +### 2 Preliminary: Group Relative Policy Optimization + +Group Relative Policy Optimization (GRPO) (Shao et al., 2024) is a variant of Proximal Policy Optimization (PPO) (Schulman et al., 2017) tailored for LLM post-training.
Given a question–answer pair (q,a) from a dataset $\mathcal{D}$, a group of G responses (rollouts) $\{o_i\}_{i=1}^G$ is sampled from the old policy $\pi_{\theta_{\text{old}}}$ (typically the policy from the previous training iteration). Each response is compared against the ground-truth answer a to produce final rewards $\{R_i\}_{i=1}^G$. Instead of relying on a costly value model like PPO, GRPO normalizes these rewards to compute the advantage as: + +$$\hat{A}_{i,t} = \frac{R_i - \operatorname{mean}(\{R_i\}_{i=1}^G)}{\operatorname{std}(\{R_i\}_{i=1}^G)},\tag{1}$$ + +where t denotes the token position in $o_i$. Note that the advantage $\hat{A}_i$ is computed at the response level and subsequently broadcast to all tokens in the response, *i.e.*, $\hat{A}_{i,1} = \hat{A}_{i,2} = \cdots = \hat{A}_{i,|o_i|} = \hat{A}_i$. The policy objective<sup>1</sup> for a single prompt is then defined as: + +$$\mathcal{J}_{GRPO}\left(\theta \mid q, a, \{o_{i}\}_{i=1}^{G}\right) = \frac{1}{G} \sum_{i=1}^{G} \frac{1}{|o_{i}|} \sum_{t=1}^{|o_{i}|} \min\left(r_{i,t}(\theta) \, \hat{A}_{i,t}, \, \operatorname{clip}\left(r_{i,t}(\theta), \, 1 - \epsilon, \, 1 + \epsilon\right) \, \hat{A}_{i,t}\right), \quad (2)$$ +where $r_{i,t}(\theta) = \frac{\pi_{\theta}(o_{i,t} \mid q, \, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, \, o_{i,< t})}.$ + +Here, $r_{i,t}(\theta)$ represents the importance sampling ratio that adjusts the objective to account for the off-policy distribution of the training batch. The final GRPO objective for the whole dataset is: + +$$\mathcal{J}_{GRPO}(\theta, \mathcal{D}) = \mathbb{E}_{(q, a) \sim \mathcal{D}, \{o_i\}_{i=1}^G \sim \pi_{\theta_{old}}(\cdot | q)} \left[ \mathcal{J}_{GRPO}(\theta | q, a, \{o_i\}_{i=1}^G) \right]. \tag{3}$$ + +### 3 REINFORCEMENT LEARNING WITH ZERO-VARIANCE PROMPTS + +In this section, we introduce our algorithm, Reinforcement Learning with Zero-Variance Prompts.
We begin by motivating the approach through the advantage vanishing problem in zero-variance prompts (Section 3.1). We then present our entropy-guided advantage formulation (Section 3.2) and conclude with the full training objective and algorithm (Section 3.3). + +#### 3.1 MOTIVATION: ADVANTAGE VANISHING IN ZERO-VARIANCE PROMPTS + +During GRPO training, the policy model may encounter questions where all sampled responses in a group are either entirely correct or entirely incorrect, resulting in identical rewards. Following Zheng et al. (2025), we call these cases *zero-variance prompts*. They typically arise in two situations: when the dataset is either (i) too difficult or (ii) too easy for the model. From a training perspective, zero-variance prompts appear most frequently (i) at the beginning of training, when the model often fails to solve problems even once across multiple responses, and (ii) near convergence, when the model has become strong enough to solve many questions entirely. + +To understand the prevalence of zero-variance prompts, we examine two settings: Qwen3-1.7B-Base trained on MATH, and Qwen3-8B-Base trained on DAPO-Math-17k. We found that zero-variance prompts account for roughly 30% to 99% of each batch during training (Figure 3). This high proportion suggests that if these prompts could be leveraged effectively, they potentially represent an untapped source of useful learning signal for the policy model. + +![](_page_2_Figure_10.jpeg) + +Figure 3: The percentage of zero-variance prompts. + +**Remark 1** (Advantage Vanishing). In Equations (1) and (2), if all responses within a group receive identical rewards, their advantages degenerate to zero. In this case, the GRPO objective reduces to $\mathcal{J}_{GRPO}(\theta \mid q_{zvp}, a, \{o_i\}_{i=1}^G) = 0$, effectively discarding all the potential learning signal.
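Remark 1 can be reproduced numerically. The minimal sketch below is ours, not the paper's; the `eps` guard on the standard deviation is an implementation detail that Equation (1) does not include:

```python
import numpy as np

def grpo_advantages(rewards, eps=1e-8):
    """Group-normalized advantages as in Equation (1)."""
    r = np.asarray(rewards, dtype=float)
    return (r - r.mean()) / (r.std() + eps)

mixed = grpo_advantages([1.0, 0.0, 1.0, 0.0])  # informative group
zvp = grpo_advantages([1.0, 1.0, 1.0, 1.0])    # zero-variance prompt

# The mixed group yields advantages of both signs; the zero-variance
# group collapses to (near-)zero, so every token's contribution to the
# gradient in Equation (4) vanishes.
```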
From Equation (2), taking the gradient of both sides$^2$ gives:

$$\nabla_{\theta} \mathcal{J}_{\text{GRPO}}(q, a, \{o_i\}_{i=1}^G) = \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} r_{i,t}(\theta) \, \hat{A}_{i,t} \nabla_{\theta} \log \pi_{\theta} \big(o_{i,t} \mid q, o_{i, < t}\big). \tag{4}$$

From this expression, the gradient of the GRPO objective with respect to the policy parameters can be interpreted as a weighted sum of the log-probability gradients for all tokens in a response. Intuitively, GRPO raises the probabilities of tokens in correct responses while lowering those of tokens in incorrect ones. Under the GRPO advantage formulation, however, if we ignore the importance-sampling ratio, each token in a response receives the same weight, regardless of its role or informativeness.

$^1$We omit the KL penalty here for clarity.

$^2$We omit the clipping factor before gradient derivation for clarity.

**Remark 2** (The Role of Advantage). The sign of the advantage $\hat{A}_{i,t}$ determines the **direction** of the gradient update for each token, since the importance sampling ratio $r_{i,t}$ in Equation (4) is strictly positive, while its absolute value $|\hat{A}_{i,t}|$ governs the **magnitude** of the update in conjunction with $r_{i,t}$.

Recently, several methods have excluded zero-variance prompts from RL training (Yu et al., 2025; Zheng et al., 2025; Cheng et al., 2025), viewing them merely as a way to reduce rollout cost and implicitly assuming they offer no value for policy optimization. We challenge this assumption and argue that, when handled appropriately, zero-variance prompts can provide valuable learning signals for RL training.

#### 3.2 ADVANTAGE FORMULATION

We propose *Reinforcement Learning with Zero-Variance Prompts* (RL-ZVP), a novel algorithm designed to exploit zero-variance prompts.
The key intuition is to address the limitation noted in Remark 1: the policy should be rewarded or penalized based solely on the correctness of its responses, even when no contrasting answers are present. RL-ZVP behaves identically to GRPO on non-zero-variance prompts but adopts a different strategy for zero-variance ones, thereby generalizing GRPO. We first classify zero-variance prompts with all correct responses as *positive prompts* and those with all incorrect responses as *negative prompts*. Based on Remark 2, we frame advantage shaping as designing appropriate scaling factors, with a focus on two properties: *direction* and *magnitude*.

**Advantage direction.** The sign of the advantage is straightforward: positive (+) for positive prompts and negative (−) for negative ones. Intuitively, when all sampled responses are correct, the model should increase their probabilities, reinforcing certainty and encouraging exploitation. Conversely, when all responses are incorrect, their probabilities should be reduced, discouraging repeated mistakes and promoting exploration of alternative sampling paths.

**Advantage magnitude.** Inspired by recent works that incorporate token-level signals to augment verifiable rewards (Cui et al., 2025; Fang et al., 2025; Agarwal et al., 2025; Deng et al., 2025; Lei et al., 2025), we leverage token entropy in the formulation of the advantage magnitude for zero-variance prompts. The key idea is to scale each token's gradient update in proportion to its entropy. For correct responses, high-entropy tokens—typically those crucial for reasoning, such as connectors or branching points (Wang et al., 2025)—receive larger updates than semantically trivial tokens. This encourages reasoning behaviors such as reflection and verification (DeepSeek-AI et al., 2025), while avoiding over-exploitation of trivial text completion patterns.
For incorrect responses, the probabilities of all tokens are reduced, but high-entropy tokens (those that initiate reasoning paths and behaviors) are penalized less severely, preserving flexibility for the model to revisit and explore those reasoning paths in future iterations.

**Advantage formula.** Combining these two design choices, our final token-level advantage formulation for zero-variance prompts, replacing the zero assigned in GRPO, is:

$$\hat{A}_{\text{ZVP}}(o_{i,t}) = \begin{cases} \alpha H_{i,t}, & \text{if } \operatorname{std}(\{R_i\}_{i=1}^G) = 0 \ \land \ R_i > 0, \\ -\alpha \left( \max_{k=1}^{|o_i|} (H_{i,k}) - H_{i,t} \right), & \text{if } \operatorname{std}(\{R_i\}_{i=1}^G) = 0 \ \land \ R_i \le 0, \end{cases} \tag{5}$$

$$\text{with } H_{i,t} = -\sum_{j=1}^{|V|} \pi_{\theta}(v_j \mid q, \ o_{i, < t}) \log \pi_{\theta}(v_j \mid q, \ o_{i, < t}),$$

where $\alpha$ denotes the scaling factor hyperparameter, and $H_{i,t}{}^3$ is the entropy of token $o_{i,t}$ in the response $o_i$ given the policy model's vocabulary $V = \{v_1, v_2, \cdots\}$. The term $\max_{k=1}^{|o_i|}(H_{i,k})$ reverses the effect of token entropy in negative prompts to align with our intuition: tokens with higher entropy incur smaller penalties, while low-entropy tokens are penalized more.

$^{3}$In the implementation, we detach every occurrence of $H_{i,t}$ from the computational graph so that it acts as a scalar factor rather than a differentiable parameter.
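Equation (5) can be sketched directly. The following is a minimal NumPy illustration (in the paper's actual implementation the entropies come from the policy's logits and are detached from the computational graph; the entropy values below are hypothetical):

```python
import numpy as np

def zvp_advantages(entropies, all_correct, alpha=0.1):
    """Token-level advantages for one response in a zero-variance group (Equation 5).

    entropies   : per-token policy entropies H_{i,t}, treated as constants
    all_correct : True for a positive prompt (R_i > 0), False for a negative one
    """
    H = np.asarray(entropies, dtype=float)
    if all_correct:
        # Positive prompt: reward every token, high-entropy tokens most.
        return alpha * H
    # Negative prompt: penalize every token, but penalize the
    # highest-entropy token least (its advantage is exactly zero).
    return -alpha * (H.max() - H)

H = [0.5, 2.0, 1.0]  # hypothetical per-token entropies
print(zvp_advantages(H, all_correct=True))
print(zvp_advantages(H, all_correct=False))
```

Note the asymmetry the formula encodes: in the positive case the magnitudes track entropy directly, while in the negative case the penalty is measured from the response's maximum entropy, so the most uncertain token receives no penalty at all.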
### Algorithm 1: Reinforcement Learning with Zero-Variance Prompts

```
Input: initial policy model π_θ; training dataset D
for iteration = 1, ..., M do
    Update the previous policy π_θ_old ← π_θ
    Sample (w/o replacement) a prompt batch B ⊂ D
    foreach (q, a) ∈ B do
        Sample G outputs {o_i}_{i=1}^G ~ π_θ_old(· | q)
        Compute rewards {R_i}_{i=1}^G by verifying each o_i against a
    for gradient_step = 1, ..., μ do
        Sample (w/o replacement) a mini-batch B_min ⊂ B
        foreach (q, a, {R_i}_{i=1}^G) ∈ B_min do
            if std({R_i}_{i=1}^G) > 0 then
                compute advantages {Â_{i,t}} using Equation (1)
            else
                compute advantages {Â_ZVP(o_{i,t})} using Equation (5)
        Update the policy model π_θ by maximizing the RL-ZVP objective (Equation 6)
Output: final policy model π_θ
```

#### 3.3 FINAL OBJECTIVE

Let $\mathcal{D}_{ZVP}$ be the set of zero-variance prompts encountered during training ($\mathcal{D}_{ZVP} \subset \mathcal{D}$). Incorporating our advantage shaping for zero-variance prompts, the RL-ZVP objective becomes:

$$\mathcal{J}_{\text{RL-ZVP}}(\theta|q, a, \{o_i\}_{i=1}^G) = \begin{cases} \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \min \left( r_{i,t}(\theta) \, \hat{A}_{\text{ZVP}}(o_{i,t}), \ \text{clip}(r_{i,t}(\theta), 1 - \epsilon, 1 + \epsilon) \, \hat{A}_{\text{ZVP}}(o_{i,t}) \right), & \text{if } q \in \mathcal{D}_{\text{ZVP}}, \\ \mathcal{J}_{GRPO}(\theta|q, a, \{o_i\}_{i=1}^G), & \text{otherwise.} \end{cases} \tag{6}$$

**Remark 3** (Relationship to GRPO). *RL-ZVP does not introduce any new loss term beyond the GRPO objective. It simply reformulates the special case where the prompt is zero-variance.
In fact, the GRPO objective from Equation (2) can be equivalently expressed in a branching form:*

$$\mathcal{J}_{GRPO}(\theta \mid q, a, \{o_i\}_{i=1}^G) = \begin{cases} 0, & \text{if } q \in \mathcal{D}_{ZVP}, \\ \mathcal{J}_{GRPO}(\theta \mid q, a, \{o_i\}_{i=1}^G), & \text{otherwise.} \end{cases} \tag{7}$$

In this regard, our formulation serves as a generalization of GRPO—reducing exactly to GRPO when prompts are non-zero-variance, while yielding a non-trivial update for zero-variance prompts rather than collapsing to zero.

## 4 EXPERIMENTS

In this section, we first outline our experimental setup, including models, datasets, and implementation details (Section 4.1). We then present validation results and analysis, comparing RL-ZVP with GRPO and prompt filtering baselines (Section 4.2). Finally, we conduct ablation studies to assess the contribution of each sub-component of RL-ZVP (Section 4.3).

### 4.1 EXPERIMENTAL SETTINGS

**Models & Datasets.** We conduct experiments on both Qwen3-1.7B-Base and Qwen3-8B-Base (Yang et al., 2025a; Liu et al., 2025b) to assess performance across model scales. For Qwen3-1.7B-Base, we train on the MATH dataset (Hendrycks et al., 2021; Lightman et al., 2023), which contains 7.5k problems. For Qwen3-8B-Base, we adopt the more challenging DAPO-Math-17k dataset (Yu et al., 2025) to better exploit the capacity of larger models. Within the scope of this work, we refer to these two setups as *small scale* and *large scale*, respectively. We then evaluate the trained models on six standard math reasoning benchmarks: Minerva, AMC23, MATH500 (Hendrycks et al., 2021), AIME24, AIME25, and OlympiadBench (He et al., 2024). For each question, we generate eight responses and report both the accuracy (Acc@8) and the pass rate (Pass@8). To further evaluate RL-ZVP, we also report additional results with a different training dataset in Appendix A.
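The two reported metrics can be read as follows. This is a hedged sketch of one natural reading (the excerpt does not spell out the exact estimator): Acc@8 as the mean correctness over the eight samples for a question, Pass@8 as whether any of the eight is correct, both averaged over the benchmark.

```python
def acc_at_k(correct):
    """Acc@k: fraction of the k sampled responses that are correct."""
    return sum(correct) / len(correct)

def pass_at_k(correct):
    """Pass@k: 1.0 if at least one of the k responses is correct."""
    return 1.0 if any(correct) else 0.0

# Hypothetical correctness flags for eight samples of one question:
flags = [True, False, True, True, False, False, False, True]
print(acc_at_k(flags))   # 0.5
print(pass_at_k(flags))  # 1.0
```

Acc@8 rewards consistent correctness, while Pass@8 measures coverage of the solution space, which is why the two can move independently in Table 1.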
**Baselines.** Since RL-ZVP is the first method that attempts to explicitly leverage zero-variance prompts, we compare it against GRPO (Shao et al., 2024) and two representative methods that filter out zero-variance prompts: GRPO with Dynamic Sampling (GRPO-DS) (Yu et al., 2025) and GRESO (Zheng et al., 2025). GRPO-DS repeatedly samples questions, generates a group of responses for each, discards zero-variance cases, and constructs batches containing only non-zero-variance prompts, ensuring that every prompt contributes to the gradient signal. In contrast, GRESO predicts and skips zero-variance prompts without sampling responses, using a probabilistic filtering strategy based on historical training dynamics. For GRPO-DS and GRESO, we report results under two settings: (r) using the *same number of rollouts* as RL-ZVP for a fair comparison, and (g) using the *same number of gradient steps*. The latter favors the prompt filtering baselines, as it allows roughly $3\times$–$5\times$ more rollouts than RL-ZVP and GRPO (see Table 5 in Appendix B).

**Implementation Details.** All experiments are implemented using the verl framework (Sheng et al., 2024), following the standard hyperparameter settings from recent works (Yu et al., 2025; Zheng et al., 2025). We train with a batch size of 512 and a mini-batch size of 32, resulting in 16 gradient steps per training iteration. Full implementation details are provided in Appendix C.

#### 4.2 RESULTS & ANALYSIS

Table 1: **Detailed evaluation results on six math reasoning benchmarks.** The best and second-best performance across all settings are **bold** and underlined, respectively. Under a fair setup with an equal rollout budget, RL-ZVP achieves the best performance across all datasets, substantially outperforming GRPO. (\*) Even in the unfavorable setting where the Dynamic Sampling baselines use $3\times$–$5\times$ more rollouts, RL-ZVP still outperforms them on the majority of benchmarks.
| Method | Minerva | | AMC23 | | MATH500 | | AIME24 | | AIME25 | | OlympiadBench | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 |
| Qwen3-1.7B-Base trained with MATH (small scale) | | | | | | | | | | | | |
| GRPO | 29.09 | 43.19 | 42.19 | 72.00 | 69.09 | 84.05 | 8.75 | 17.24 | 4.17 | 6.67 | 33.20 | 50.52 |
| GRPO-DS-r | 29.96 | 44.02 | 42.50 | 73.84 | 68.31 | 83.04 | 7.50 | 17.41 | 2.92 | 8.73 | 32.67 | 49.89 |
| GRPO-DS-g* | 29.96 | 44.02 | 46.25 | 73.84 | 70.72 | 84.61 | 7.50 | 25.77 | 7.50 | 12.95 | 35.68 | 53.03 |
| GRESO-r | 27.99 | 43.39 | 40.00 | 70.25 | 67.53 | 83.07 | 7.08 | 18.48 | 3.75 | 12.89 | 31.30 | 49.13 |
| GRESO-g* | 30.19 | 43.39 | 45.00 | 73.86 | 69.77 | 84.77 | 9.58 | 19.51 | 5.00 | 11.00 | 34.68 | 52.43 |
| RL-ZVP (Ours) | 29.96 | 44.76 | 48.75 | 77.19 | 70.98 | 87.35 | 12.50 | 23.99 | 6.25 | 13.23 | 35.11 | 54.84 |
| Δ (w.r.t. GRPO) | +0.87 | +1.57 | +6.56 | +5.19 | +1.89 | +3.30 | +3.75 | +6.75 | +2.08 | +6.56 | +1.91 | +4.32 |
| Qwen3-8B-Base trained with DAPO-Math-17k (large scale) | | | | | | | | | | | | |
| GRPO | 44.99 | 58.42 | 73.75 | 92.48 | 83.00 | 92.40 | 28.33 | 50.64 | 20.00 | 31.59 | 49.59 | 66.20 |
| GRPO-DS-r | 29.96 | 50.85 | 47.19 | 76.70 | 68.20 | 87.26 | 12.08 | 31.33 | 9.17 | 22.02 | 43.88 | 58.16 |
| GRPO-DS-g* | 44.76 | 59.60 | 76.25 | 92.43 | 84.98 | 92.42 | 27.08 | 42.72 | 19.58 | 35.93 | 44.83 | 60.42 |
| GRESO-r | 30.74 | 50.53 | 49.38 | 80.53 | 67.40 | 87.79 | 12.08 | 29.31 | 10.83 | 26.26 | 44.92 | 59.66 |
| GRESO-g* | 45.50 | 58.69 | 78.13 | 92.90 | 83.23 | 92.24 | 25.00 | 41.80 | 17.92 | 32.12 | 44.14 | 59.79 |
| RL-ZVP (Ours) | 45.96 | 58.83 | 78.75 | 94.18 | 89.73 | 96.13 | 33.33 | 54.66 | 24.58 | 39.36 | 58.20 | 72.28 |
| Δ (w.r.t. GRPO) | +0.97 | +0.41 | +5.00 | +1.70 | +6.73 | +3.73 | +5.00 | +4.02 | +4.58 | +7.77 | +8.61 | +6.08 |

**RL-ZVP achieves improvements of up to 8.61 points in accuracy and 7.77 points in pass rate over GRPO.** We present our main results in Table 1. Across six benchmarks, RL-ZVP consistently delivers significant gains over GRPO, ranging from 0.87 to 6.75 points on Qwen3-1.7B-Base and from 0.41 to 8.61 points on Qwen3-8B-Base. On the small scale, the average improvements are 2.84 points in Acc@8 and 4.62 points in Pass@8, while on the large scale, the average gains are 5.15 points in Acc@8 and 3.95 points in Pass@8. These results indicate that leveraging zero-variance prompts provides strong and informative learning signals that substantially enhance reasoning performance, with improvements generalizing across both model scales and datasets.

**RL-ZVP consistently outperforms prompt-filtering baselines, even under disadvantageous settings.** Under the fair setting with the same rollout budget, RL-ZVP substantially surpasses both GRPO-DS-r and GRESO-r across all benchmarks, achieving average gains of 3.28 points in Acc@8 and 4.03 points in Pass@8 on the small scale, and 19.20 points in Acc@8 and 13.56 points in Pass@8 on the large scale (see Figure 6 in Appendix B). Even in the less favorable setting with the same number
of gradient steps—where the baselines require $3\times$–$5\times$ more rollouts—RL-ZVP still outperforms GRPO-DS-g and GRESO-g on the majority of benchmarks, with average improvements (over their best performance) of 0.99 points in Acc@8 and 1.19 points in Pass@8 on the small scale, and 5.51 points in Acc@8 and 5.32 points in Pass@8 on the large scale. These results not only reaffirm the importance of learning signals from zero-variance prompts, but also provide a new insight: leveraging such prompts brings greater benefits than discarding them, directly challenging the prevailing practice adopted in recent works.

Figure 4: Validation accuracy and training dynamics at different experiment scales. Each row shows Acc@8, entropy, and response length during training for Qwen3-1.7B-Base (top) and Qwen3-8B-Base (bottom). RL-ZVP exhibits more consistent and stable trends than GRPO.

**RL-ZVP demonstrates a more stable and consistent learning process.** Figures 4a and 4d present the average validation accuracy across benchmarks of RL-ZVP and GRPO as training progresses. On the 1.7B scale, RL-ZVP is initially competitive with GRPO, but after roughly 500 gradient steps it achieves a clear performance boost, creating a significant performance gap. On the 8B scale, the difference is even more pronounced: RL-ZVP exhibits a smooth and steadily improving curve, while GRPO suffers a sharp drop during the first 200 gradient steps, then recovers but eventually underperforms RL-ZVP. This highlights RL-ZVP's superior effectiveness and stability, which avoids the performance collapses observed at the start of training with GRPO on the large scale.

We attribute this robustness to the quantity and quality of learning signals each method receives at different training stages. In small-scale experiments, the number of zero-variance prompts increases as training progresses (Figure 3). The model improves and produces more positive prompts (all-correct responses), which RL-ZVP can leverage as useful signals while GRPO discards them. This leads to a widening gap in later stages of training. For large-scale experiments, the situation is reversed at the start of training: the model initially solves fewer than 5% of problems per batch (Figure 3), leaving GRPO with sparse and noisy feedback. In contrast, RL-ZVP can still exploit the abundance of negative prompts (all-incorrect responses), obtaining denser feedback and thereby greater training stability in the early stages.
Although the proportion of zero-variance prompts decreases as training continues, it remains non-negligible, sustaining a substantial advantage for RL-ZVP over GRPO. Overall, across both scales, RL-ZVP consistently benefits from the additional signals provided by zero-variance prompts, ensuring stronger and more stable performance than GRPO.

**RL-ZVP encourages longer and more confident reasoning across model scales.** As training progresses, RL-ZVP steadily reduces entropy while producing increasingly longer responses. Both trends align with significant gains in validation results (Figure 4). These patterns, along with the qualitative demonstration (Appendix D), suggest two complementary effects on the policy model: (i) enhanced capabilities due to the emergence of long-form reasoning behaviors—the so-called *aha moment* (DeepSeek-AI et al., 2025; Jin et al., 2024)—and (ii) greater confidence in exploratory decisions, as reflected in the low entropy (Cheng et al., 2025; Kadavath et al., 2022).

On the small scale, models trained with RL-ZVP show a significant increase in response length over training, indicating the ability to generate extensively long chains of reasoning that greatly improve their reasoning abilities (Jin et al., 2024; DeepSeek-AI et al., 2025). In contrast, GRPO-trained models exhibit minimal growth in response length, aligning with their bounded validation performance and potentially their inability to generate multi-step, well-structured solutions. On the large scale, entropy initially drops sharply but then stabilizes after roughly 300 gradient steps rather than collapsing. Combined with the consistent growth in response length and accuracy, this rules out entropy collapse and underscores RL-ZVP's ability to promote detailed and confident reasoning (Cui et al., 2025; Wang et al., 2024).

**RL-ZVP facilitates more sophisticated reasoning even with small policy models.**
In many cases, especially on the 1.7B scale, GRPO-trained models produce simple and direct solutions, jumping straight into problem-solving with little structure. Even when they show glimpses of more complex behaviors such as reflection, these behaviors are shallow and disorganized. In contrast, models trained with RL-ZVP display much more sophisticated and systematic reasoning strategies: they restate and reflect on the key hypotheses, carefully interpret the clues, plan solutions step by step, organize their reasoning into major steps with detailed sub-solutions, and reasonably attempt alternative strategies when encountering dead ends. Illustrative examples are provided in Appendix D.

For instance, during exploration, GRPO-trained models often rely on shallow trial-and-error heuristics (e.g., "let's try another set of values", "let's try a different method"), which resemble second-guessing and lack direction. In contrast, models trained with RL-ZVP adopt more principled and comprehensive strategies (e.g., "let's double check", "assume symmetry to simplify the problem", "try to factor directly", "let's try to express the sides using the Law of Cosines"), demonstrating focused and systematic exploration. Remarkably, such behaviors emerge even in models as small as 1.7B when trained with RL-ZVP, whereas GRPO fails to elicit them at this scale. On the 8B scale, both GRPO and RL-ZVP produce systematic solutions with clear structure. However, GRPO outputs tend to be repetitive and rigid in tone, while RL-ZVP exhibits a more flexible and exploratory style of reasoning, reflecting a deeper ability to search for and evaluate alternative solution paths.

#### 4.3 ABLATION STUDY

To better understand the contribution of each design choice in RL-ZVP, we conduct ablation experiments on Qwen3-1.7B-Base trained with MATH unless stated otherwise.

**Impact of zero-variance type and entropy scaling.**
We examine the contribution of three sub-components of our advantage formulation for RL-ZVP: negative prompts, positive prompts, and the entropy scaling factor. Table 2 reports the performance when each component is removed, comparing against full RL-ZVP and GRPO. The results are consistent across benchmarks: full RL-ZVP achieves the best performance in all evaluations. Ignoring either negative or positive prompts yields comparable performance, confirming that both play similar roles in providing learning signals for the policy model. Importantly, removing the entropy scaling factor (i.e., using only a sample-level +1/−1 advantage) causes the most severe degradation. This emphasizes the importance of the advantage magnitude design for the learning signal from zero-variance prompts: simply promoting correct answers and penalizing incorrect ones is insufficient. Rewards and penalties must also be weighted appropriately to be effective.

Table 2: **Ablation on the sub-components of RL-ZVP.** Full RL-ZVP achieves the best average performance, while the second-best results mostly occur when a single component is removed.

| Method | Minerva | | AMC23 | | MATH500 | | Average | |
|---|---|---|---|---|---|---|---|---|
| | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 |
| RL-ZVP (Ours) | 29.96 | 44.76 | 48.75 | 77.19 | 70.98 | 87.35 | 49.90 | 69.77 |
| - w/o negative prompts | 29.92 | 43.87 | 44.69 | 70.51 | 68.64 | 84.40 | 47.75 | 66.26 |
| - w/o positive prompts | 29.50 | 43.88 | 41.56 | 70.40 | 70.70 | 85.81 | 47.25 | 66.70 |
| - w/o entropy scaling | 28.95 | 43.02 | 42.50 | 70.76 | 69.19 | 84.37 | 46.88 | 66.05 |
| GRPO | 29.09 | 43.19 | 42.19 | 72.00 | 69.09 | 84.05 | 46.79 | 66.41 |

**Impact of $\alpha$.**
At the beginning of training, the top 20% of tokens with the highest entropy often take values between 2 and 10, which can result in very large advantages if left unscaled. To avoid exploding updates and potential training instability, we introduce $\alpha$ as the scaling factor in the advantage computation. Table 3 shows the performance of RL-ZVP under different $\alpha$ values. We observe that RL-ZVP achieves its best results with $\alpha=0.10$ or $\alpha=0.20$, while performance drops sharply at both $\alpha=0.05$ and $\alpha=0.30$. This demonstrates that $\alpha$ plays a critical role in balancing stability and effective learning: overly small values fail to sufficiently utilize the learning signal from zero-variance prompts, while overly large values may cause unstable training with large gradient updates. Thus, $\alpha$ must be tuned carefully to fully realize the benefits of our entropy-guided advantage design.

| Alpha | Minerva | | AMC23 | | MATH500 | | Average | |
|---|---|---|---|---|---|---|---|---|
| | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 |
| $\alpha = 0.05$ | 29.60 | 43.78 | 42.19 | 69.44 | 68.56 | 82.96 | 46.78 | 65.39 |
| $\alpha = 0.10$ | 29.96 | 44.76 | 48.75 | 77.19 | 70.98 | 87.35 | 49.90 | 69.77 |
| $\alpha = 0.20$ | 29.60 | 44.54 | 43.75 | 74.14 | 71.10 | 86.47 | 48.15 | 68.38 |
| $\alpha = 0.30$ | 28.68 | 42.61 | 41.56 | 71.22 | 67.20 | 83.46 | 45.81 | 65.76 |

Table 3: Validation results under different $\alpha$.

#### 5 RELATED WORK

**Reinforcement Learning for LLM Reasoning.** With the rise of test-time scaling (Muennighoff et al., 2025) and large reasoning models (Team et al., 2025; DeepSeek-AI et al., 2025; OpenAI et al., 2024), the post-training process has increasingly shifted toward Reinforcement Learning with Verifiable Rewards (RLVR), which enhances reasoning ability using rule-based feedback.
However, these methods either discard zero-variance prompts implicitly (Shao et al., 2024; Liu et al., 2025a) or explicitly (Yu et al., 2025; Zheng et al., 2025; Qu et al., 2025). This leads to wasted computation and reduced data efficiency during online training, as rollouts remain a major bottleneck in RL training (Zhong et al., 2025; Noukhovitch et al., 2025). Our work addresses this gap by extracting useful signals from zero-variance prompts, thereby reducing wasted rollout cost.

**Online Prompt Selection.** Zero-variance prompts have been identified as a key weakness of GRPO in recent works (Yu et al., 2025; Zheng et al., 2025; Qu et al., 2025). DAPO addresses this issue with post-rollout oversampling: repeatedly sampling until a batch containing only non-zero-variance prompts is obtained (Yu et al., 2025). GRESO improves efficiency with a pre-rollout probabilistic filter, leveraging training history to predict whether a prompt is likely to be zero-variance (Zheng et al., 2025). While these approaches can be broadly viewed as online prompt selection methods, our work differs fundamentally: instead of filtering out zero-variance prompts, we propose extracting useful learning signals from them, thereby improving the effectiveness of RL training.

### 6 CLOSING REMARKS

In this paper, we introduced RL-ZVP, a novel algorithm that extracts useful learning signals from zero-variance prompts to enhance policy optimization. Within these prompts, RL-ZVP directly rewards correct responses and penalizes incorrect ones, without requiring contrastive answers within the rollout group. Furthermore, RL-ZVP adopts a fine-grained optimization strategy, scaling updates by token entropy to elicit stronger reasoning behaviors in the policy model. Empirical results show that RL-ZVP delivers significant improvements, outperforming GRPO by up to 8.61 points in accuracy and 7.77 points in pass rate, while also substantially surpassing other prompt filtering baselines.
To the best of our knowledge, this is the first work to demonstrate that zero-variance prompts can be exploited rather than discarded. We believe this perspective opens a new direction for RL training, with possibilities for even larger gains through refined formulations in future work.

**Limitations and Future Directions.** Due to computational constraints, our experiments are limited to models of up to 8B parameters. Future work should investigate how RL-ZVP scales to larger models (e.g., 14B or 32B). Furthermore, we only validate RL-ZVP on verifiable tasks with binary rewards; extending it to settings with graded or ambiguous feedback (e.g., open-ended QA, text summarization, safety alignment) remains an open challenge. Finally, RL-ZVP is sensitive to the fixed scaling hyperparameter $\alpha$, and more sophisticated strategies such as adaptive scheduling may provide more robust alternatives.

#### ACKNOWLEDGEMENT

We thank Brano Kveton, Nikos Vlassis, Yoonjeon Kim, Doohyuk Jang, Gyouk Chu, Minh-Hai Nguyen, and Tri Le for their valuable feedback and discussions. We are also grateful to Trung Bui for supporting compute allocation.

This work was supported in part by the National Research Foundation of Korea (NRF) grant funded by the Korean government (MSIT) [RS-2024-00337693] (Myeongho Jeon).

#### REFERENCES

Shivam Agarwal, Zimin Zhang, Lifan Yuan, Jiawei Han, and Hao Peng. The unreasonable effectiveness of entropy minimization in llm reasoning, 2025. URL https://arxiv.org/abs/2505.15134.

Anthropic. Claude 3.7 Sonnet, 2025. URL https://www.anthropic.com/news/claude-3-7-sonnet.

Daixuan Cheng, Shaohan Huang, Xuekai Zhu, Bo Dai, Wayne Xin Zhao, Zhenliang Zhang, and Furu Wei. Reasoning with exploration: An entropy perspective. *arXiv preprint arXiv:2506.14758*, 2025. URL https://arxiv.org/pdf/2506.14758.

Ganqu Cui, Yuchen Zhang, Jiacheng Chen, Lifan Yuan, Zhi Wang, Yuxin Zuo, Haozhan Li, Yuchen Fan, Huayu Chen, Weize Chen, et al.
The entropy mechanism of reinforcement learning for reasoning language models. *arXiv preprint arXiv:2505.22617*, 2025. URL https://arxiv.org/pdf/2505.22617. + +DeepSeek-AI, Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Ruoyu Zhang, Runxin Xu, Qihao Zhu, Shirong Ma, Peiyi Wang, Xiao Bi, Xiaokang Zhang, Xingkai Yu, Yu Wu, Z. F. Wu, Zhibin Gou, Zhihong Shao, Zhuoshu Li, Ziyi Gao, Aixin Liu, Bing Xue, Bingxuan Wang, Bochao Wu, Bei Feng, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, Damai Dai, Deli Chen, Dongjie Ji, Erhang Li, Fangyun Lin, Fucong Dai, Fuli Luo, Guangbo Hao, Guanting Chen, Guowei Li, H. Zhang, Han Bao, Hanwei Xu, Haocheng Wang, Honghui Ding, Huajian Xin, Huazuo Gao, Hui Qu, Hui Li, Jianzhong Guo, Jiashi Li, Jiawei Wang, Jingchang Chen, Jingyang Yuan, Junjie Qiu, Junlong Li, J. L. Cai, Jiaqi Ni, Jian Liang, Jin Chen, Kai Dong, Kai Hu, Kaige Gao, Kang Guan, Kexin Huang, Kuai Yu, Lean Wang, Lecong Zhang, Liang Zhao, Litong Wang, Liyue Zhang, Lei Xu, Leyi Xia, Mingchuan Zhang, Minghua Zhang, Minghui Tang, Meng Li, Miaojun Wang, Mingming Li, Ning Tian, Panpan Huang, Peng Zhang, Qiancheng Wang, Qinyu Chen, Qiushi Du, Ruiqi Ge, Ruisong Zhang, Ruizhe Pan, Runji Wang, R. J. Chen, R. L. Jin, Ruyi Chen, Shanghao Lu, Shangyan Zhou, Shanhuang Chen, Shengfeng Ye, Shiyu Wang, Shuiping Yu, Shunfeng Zhou, Shuting Pan, S. S. Li, Shuang Zhou, Shaoqing Wu, Shengfeng Ye, Tao Yun, Tian Pei, Tianyu Sun, T. Wang, Wangding Zeng, Wanjia Zhao, Wen Liu, Wenfeng Liang, Wenjun Gao, Wenqin Yu, Wentao Zhang, W. L. Xiao, Wei An, Xiaodong Liu, Xiaohan Wang, Xiaokang Chen, Xiaotao Nie, Xin Cheng, Xin Liu, Xin Xie, Xingchao Liu, Xinyu Yang, Xinyuan Li, Xuecheng Su, Xuheng Lin, X. Q. Li, Xiangyue Jin, Xiaojin Shen, Xiaosha Chen, Xiaowen Sun, Xiaoxiang Wang, Xinnan Song, Xinyi Zhou, Xianzu Wang, Xinxia Shan, Y. K. Li, Y. Q. Wang, Y. X. 
Wei, Yang Zhang, Yanhong Xu, Yao Li, Yao Zhao, Yaofeng Sun, Yaohui Wang, Yi Yu, Yichao Zhang, Yifan Shi, Yiliang Xiong, Ying He, Yishi Piao, Yisong Wang, Yixuan Tan, Yiyang Ma, Yiyuan Liu, Yongqiang Guo, Yuan Ou, Yuduan Wang, Yue Gong, Yuheng Zou, Yujia He, Yunfan Xiong, Yuxiang Luo, Yuxiang You, Yuxuan Liu, Yuyang Zhou, Y. X. Zhu, Yanhong Xu, Yanping Huang, Yaohui Li, Yi Zheng, Yuchen Zhu, Yunxian Ma, Ying Tang, Yukun Zha, Yuting Yan, Z. Z. Ren, Zehui Ren, Zhangli Sha, Zhe Fu, Zhean Xu, Zhenda Xie, Zhengyan Zhang, Zhewen Hao, Zhicheng Ma, Zhigang Yan, Zhiyu Wu, Zihui Gu, Zijia Zhu, Zijun Liu, Zilin Li, Ziwei Xie, Ziyang Song, Zizheng Pan, Zhen Huang, Zhipeng Xu, Zhongyu Zhang, and Zhen Zhang. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning, 2025. + +Jia Deng, Jie Chen, Zhipeng Chen, Wayne Xin Zhao, and Ji-Rong Wen. Decomposing the entropy-performance exchange: The missing keys to unlocking effective reinforcement learning, 2025. URL https://arxiv.org/abs/2508.02260. + +- Rati Devidze, Parameswaran Kamalaruban, and Adish Singla. Exploration-guided reward shaping for reinforcement learning under sparse rewards. In S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (eds.), *Advances in Neural Information Processing Systems*, volume 35, pp. 5829–5842. Curran Associates, Inc., 2022. URL https://proceedings.neurips.cc/paper\_files/paper/2022/file/266c0f191b04cbbbe529016d0edc847e-Paper-Conference.pdf. +- Lizhe Fang, Yifei Wang, Zhaoyang Liu, Chenheng Zhang, Stefanie Jegelka, Jinyang Gao, Bolin Ding, and Yisen Wang. What is wrong with perplexity for long-context language modeling?, 2025. URL https://arxiv.org/abs/2410.23771. +- Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, Jie Liu, Lei Qi, Zhiyuan Liu, and Maosong Sun. Olympiadbench: A challenging benchmark for promoting agi with olympiad-level bilingual multimodal scientific problems, 2024. 
URL https://arxiv.org/abs/2402.14008. +- Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. Measuring mathematical problem solving with the MATH dataset. *arXiv preprint arXiv:2103.03874*, 2021. +- Mingyu Jin, Qinkai Yu, Dong Shu, Haiyan Zhao, Wenyue Hua, Yanda Meng, Yongfeng Zhang, and Mengnan Du. The impact of reasoning step length on large language models, 2024. URL https://arxiv.org/abs/2401.04925. +- Saurav Kadavath, Tom Conerly, Amanda Askell, Tom Henighan, Dawn Drain, Ethan Perez, Nicholas Schiefer, Zac Hatfield-Dodds, Nova DasSarma, Eli Tran-Johnson, Scott Johnston, Sheer El-Showk, Andy Jones, Nelson Elhage, Tristan Hume, Anna Chen, Yuntao Bai, Sam Bowman, Stanislav Fort, Deep Ganguli, Danny Hernandez, Josh Jacobson, Jackson Kernion, Shauna Kravec, Liane Lovitt, Kamal Ndousse, Catherine Olsson, Sam Ringer, Dario Amodei, Tom Brown, Jack Clark, Nicholas Joseph, Ben Mann, Sam McCandlish, Chris Olah, and Jared Kaplan. Language models (mostly) know what they know, 2022. URL https://arxiv.org/abs/2207.05221. +- Takeshi Kojima, Shixiang Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. Large language models are zero-shot reasoners, 2023. URL https://arxiv.org/abs/2205.11916. +- Shiye Lei, Zhihao Cheng, Kai Jia, and Dacheng Tao. Revisiting llm reasoning via information bottleneck, 2025. URL https://arxiv.org/abs/2507.18391. +- Jia LI, Edward Beeching, Lewis Tunstall, Ben Lipkin, Roman Soletskyi, Shengyi Costa Huang, Kashif Rasul, Longhui Yu, Albert Jiang, Ziju Shen, Zihan Qin, Bin Dong, Li Zhou, Yann Fleureau, Guillaume Lample, and Stanislas Polu. Numinamath. [https://huggingface.co/AI-MO/NuminaMath-CoT](https://github.com/project-numina/aimo-progress-prize/blob/main/report/numina\_dataset.pdf), 2024. +- Hunter Lightman, Vineet Kosaraju, Yura Burda, Harri Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step, 2023. 
URL https://arxiv.org/abs/2305.20050. +- Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin. Understanding r1-zero-like training: A critical perspective, 2025a. URL https://arxiv.org/abs/2503.20783. +- Zihe Liu, Jiashun Liu, Yancheng He, Weixun Wang, Jiaheng Liu, Ling Pan, Xinyu Hu, Shaopan Xiong, Ju Huang, Jian Hu, Shengyi Huang, Siran Yang, Jiamang Wang, Wenbo Su, and Bo Zheng. Part i: Tricks or traps? a deep dive into rl for llm reasoning, 2025b. URL https://arxiv.org/abs/2508.08221. +- Niklas Muennighoff, Zitong Yang, Weijia Shi, Xiang Lisa Li, Li Fei-Fei, Hannaneh Hajishirzi, Luke Zettlemoyer, Percy Liang, Emmanuel Candès, and Tatsunori Hashimoto. s1: Simple test-time scaling, 2025. URL https://arxiv.org/abs/2501.19393. + +Andrew Y. Ng, Daishi Harada, and Stuart J. Russell. Policy invariance under reward transformations: Theory and application to reward shaping. In *Proceedings of the Sixteenth International Conference on Machine Learning*, ICML '99, pp. 278–287, San Francisco, CA, USA, 1999. Morgan Kaufmann Publishers Inc. ISBN 1558606122. + +Michael Noukhovitch, Shengyi Huang, Sophie Xhonneux, Arian Hosseini, Rishabh Agarwal, and Aaron Courville. Asynchronous rlhf: Faster and more efficient off-policy rl for language models, 2025. URL https://arxiv.org/abs/2410.18252. 
+ +OpenAI, :, Aaron Jaech, Adam Kalai, Adam Lerer, Adam Richardson, Ahmed El-Kishky, Aiden Low, Alec Helyar, Aleksander Madry, Alex Beutel, Alex Carney, Alex Iftimie, Alex Karpenko, Alex Tachard Passos, Alexander Neitz, Alexander Prokofiev, Alexander Wei, Allison Tam, Ally Bennett, Ananya Kumar, Andre Saraiva, Andrea Vallone, Andrew Duberstein, Andrew Kondrich, Andrey Mishchenko, Andy Applebaum, Angela Jiang, Ashvin Nair, Barret Zoph, Behrooz Ghorbani, Ben Rossen, Benjamin Sokolowsky, Boaz Barak, Bob McGrew, Borys Minaiev, Botao Hao, Bowen Baker, Brandon Houghton, Brandon McKinzie, Brydon Eastman, Camillo Lugaresi, Cary Bassin, Cary Hudson, Chak Ming Li, Charles de Bourcy, Chelsea Voss, Chen Shen, Chong Zhang, Chris Koch, Chris Orsinger, Christopher Hesse, Claudia Fischer, Clive Chan, Dan Roberts, Daniel Kappler, Daniel Levy, Daniel Selsam, David Dohan, David Farhi, David Mely, David Robinson, Dimitris Tsipras, Doug Li, Dragos Oprica, Eben Freeman, Eddie Zhang, Edmund Wong, Elizabeth Proehl, Enoch Cheung, Eric Mitchell, Eric Wallace, Erik Ritter, Evan Mays, Fan Wang, Felipe Petroski Such, Filippo Raso, Florencia Leoni, Foivos Tsimpourlas, Francis Song, Fred von Lohmann, Freddie Sulit, Geoff Salmon, Giambattista Parascandolo, Gildas Chabot, Grace Zhao, Greg Brockman, Guillaume Leclerc, Hadi Salman, Haiming Bao, Hao Sheng, Hart Andrin, Hessam Bagherinezhad, Hongyu Ren, Hunter Lightman, Hyung Won Chung, Ian Kivlichan, Ian O'Connell, Ian Osband, Ignasi Clavera Gilaberte, Ilge Akkaya, Ilya Kostrikov, Ilya Sutskever, Irina Kofman, Jakub Pachocki, James Lennon, Jason Wei, Jean Harb, Jerry Twore, Jiacheng Feng, Jiahui Yu, Jiayi Weng, Jie Tang, Jieqi Yu, Joaquin Quiñonero Candela, Joe Palermo, Joel Parish, Johannes Heidecke, John Hallman, John Rizzo, Jonathan Gordon, Jonathan Uesato, Jonathan Ward, Joost Huizinga, Julie Wang, Kai Chen, Kai Xiao, Karan Singhal, Karina Nguyen, Karl Cobbe, Katy Shi, Kayla Wood, Kendra Rimbach, Keren Gu-Lemberg, Kevin Liu, Kevin Lu, Kevin 
Stone, Kevin Yu, Lama Ahmad, Lauren Yang, Leo Liu, Leon Maksin, Leyton Ho, Liam Fedus, Lilian Weng, Linden Li, Lindsay McCallum, Lindsey Held, Lorenz Kuhn, Lukas Kondraciuk, Lukasz Kaiser, Luke Metz, Madelaine Boyd, Maja Trebacz, Manas Joglekar, Mark Chen, Marko Tintor, Mason Meyer, Matt Jones, Matt Kaufer, Max Schwarzer, Meghan Shah, Mehmet Yatbaz, Melody Y. Guan, Mengyuan Xu, Mengyuan Yan, Mia Glaese, Mianna Chen, Michael Lampe, Michael Malek, Michele Wang, Michelle Fradin, Mike McClay, Mikhail Pavlov, Miles Wang, Mingxuan Wang, Mira Murati, Mo Bavarian, Mostafa Rohaninejad, Nat McAleese, Neil Chowdhury, Neil Chowdhury, Nick Ryder, Nikolas Tezak, Noam Brown, Ofir Nachum, Oleg Boiko, Oleg Murk, Olivia Watkins, Patrick Chao, Paul Ashbourne, Pavel Izmailov, Peter Zhokhov, Rachel Dias, Rahul Arora, Randall Lin, Rapha Gontijo Lopes, Raz Gaon, Reah Miyara, Reimar Leike, Renny Hwang, Rhythm Garg, Robin Brown, Roshan James, Rui Shu, Ryan Cheu, Ryan Greene, Saachi Jain, Sam Altman, Sam Toizer, Sam Toyer, Samuel Miserendino, Sandhini Agarwal, Santiago Hernandez, Sasha Baker, Scott McKinney, Scottie Yan, Shengjia Zhao, Shengli Hu, Shibani Santurkar, Shraman Ray Chaudhuri, Shuyuan Zhang, Siyuan Fu, Spencer Papay, Steph Lin, Suchir Balaji, Suvansh Sanjeev, Szymon Sidor, Tal Broda, Aidan Clark, Tao Wang, Taylor Gordon, Ted Sanders, Tejal Patwardhan, Thibault Sottiaux, Thomas Degry, Thomas Dimson, Tianhao Zheng, Timur Garipov, Tom Stasi, Trapit Bansal, Trevor Creech, Troy Peterson, Tyna Eloundou, Valerie Qi, Vineet Kosaraju, Vinnie Monaco, Vitchyr Pong, Vlad Fomenko, Weiyi Zheng, Wenda Zhou, Wes McCabe, Wojciech Zaremba, Yann Dubois, Yinghai Lu, Yining Chen, Young Cha, Yu Bai, Yuchen He, Yuchen Zhang, Yunyun Wang, Zheng Shao, and Zhuohan Li. OpenAI o1 system card, 2024. URL https://arxiv.org/abs/2412.16720. + +Yun Qu, Qi Wang, Yixiu Mao, Vincent Tao Hu, Björn Ommer, and Xiangyang Ji. 
Can prompt difficulty be online predicted for accelerating rl finetuning of reasoning models?, 2025. URL https://arxiv.org/abs/2507.04632. + +John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms, 2017. URL https://arxiv.org/abs/1707.06347. + +- Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo. Deepseekmath: Pushing the limits of mathematical reasoning in open language models, 2024. URL https://arxiv.org/abs/2402.03300. +- Guangming Sheng, Chi Zhang, Zilingfeng Ye, Xibin Wu, Wang Zhang, Ru Zhang, Yanghua Peng, Haibin Lin, and Chuan Wu. Hybridflow: A flexible and efficient rlhf framework. *arXiv* preprint *arXiv*: 2409.19256, 2024. +- Kimi Team, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, Chuning Tang, Congcong Wang, Dehao Zhang, Enming Yuan, Enzhe Lu, Fengxiang Tang, Flood Sung, Guangda Wei, Guokun Lai, Haiqing Guo, Han Zhu, Hao Ding, Hao Hu, Hao Yang, Hao Zhang, Haotian Yao, Haotian Zhao, Haoyu Lu, Haoze Li, Haozhen Yu, Hongcheng Gao, Huabin Zheng, Huan Yuan, Jia Chen, Jianhang Guo, Jianlin Su, Jianzhou Wang, Jie Zhao, Jin Zhang, Jingyuan Liu, Junjie Yan, Junyan Wu, Lidong Shi, Ling Ye, Longhui Yu, Mengnan Dong, Neo Zhang, Ningchen Ma, Qiwei Pan, Qucheng Gong, Shaowei Liu, Shengling Ma, Shupeng Wei, Sihan Cao, Siying Huang, Tao Jiang, Weihao Gao, Weimin Xiong, Weiran He, Weixiao Huang, Weixin Xu, Wenhao Wu, Wenyang He, Xianghui Wei, Xianqing Jia, Xingzhe Wu, Xinran Xu, Xinxing Zu, Xinyu Zhou, Xuehai Pan, Y. Charles, Yang Li, Yangyang Hu, Yangyang Liu, Yanru Chen, Yejie Wang, Yibo Liu, Yidao Qin, Yifeng Liu, Ying Yang, Yiping Bao, Yulun Du, Yuxin Wu, Yuzhi Wang, Zaida Zhou, Zhaoji Wang, Zhaowei Li, Zhen Zhu, Zheng Zhang, Zhexu Wang, Zhilin Yang, Zhiqi Huang, Zihao Huang, Ziyao Xu, Zonghan Yang, and Zongyu Lin. 
Kimi k1.5: Scaling reinforcement learning with llms, 2025. URL https://arxiv.org/abs/2501.12599. +- Binghai Wang, Rui Zheng, Lu Chen, Yan Liu, Shihan Dou, Caishuang Huang, Wei Shen, Senjie Jin, Enyu Zhou, Chenyu Shi, Songyang Gao, Nuo Xu, Yuhao Zhou, Xiaoran Fan, Zhiheng Xi, Jun Zhao, Xiao Wang, Tao Ji, Hang Yan, Lixing Shen, Zhan Chen, Tao Gui, Qi Zhang, Xipeng Qiu, Xuanjing Huang, Zuxuan Wu, and Yu-Gang Jiang. Secrets of rlhf in large language models part ii: Reward modeling, 2024. URL https://arxiv.org/abs/2401.06080. +- Shenzhi Wang, Le Yu, Chang Gao, Chujie Zheng, Shixuan Liu, Rui Lu, Kai Dang, Xionghui Chen, Jianxin Yang, Zhenru Zhang, et al. Beyond the 80/20 rule: High-entropy minority tokens drive effective reinforcement learning for llm reasoning. *arXiv preprint arXiv:2506.01939*, 2025. URL https://arxiv.org/pdf/2506.01939. +- Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed Chi, Quoc Le, and Denny Zhou. Chain-of-thought prompting elicits reasoning in large language models, 2023. URL https://arxiv.org/abs/2201.11903. +- An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, Chujie Zheng, Dayiheng Liu, Fan Zhou, Fei Huang, Feng Hu, Hao Ge, Haoran Wei, Huan Lin, Jialong Tang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jing Zhou, Jingren Zhou, Junyang Lin, Kai Dang, Keqin Bao, Kexin Yang, Le Yu, Lianghao Deng, Mei Li, Mingfeng Xue, Mingze Li, Pei Zhang, Peng Wang, Qin Zhu, Rui Men, Ruize Gao, Shixuan Liu, Shuang Luo, Tianhao Li, Tianyi Tang, Wenbiao Yin, Xingzhang Ren, Xinyu Wang, Xinyu Zhang, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yinger Zhang, Yu Wan, Yuqiong Liu, Zekun Wang, Zeyu Cui, Zhenru Zhang, Zhipeng Zhou, and Zihan Qiu. Qwen3 technical report, 2025a. URL https://arxiv.org/abs/2505.09388. +- An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, et al. 
Qwen3 technical report. *arXiv preprint* arXiv:2505.09388, 2025b. +- Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Weinan Dai, Tiantian Fan, Gaohong Liu, Lingjun Liu, Xin Liu, Haibin Lin, Zhiqi Lin, Bole Ma, Guangming Sheng, Yuxuan Tong, Chi Zhang, Mofan Zhang, Wang Zhang, Hang Zhu, Jinhua Zhu, Jiaze Chen, Jiangjie Chen, Chengyi Wang, Hongli Yu, Yuxuan Song, Xiangpeng Wei, Hao Zhou, Jingjing Liu, Wei-Ying Ma, Ya-Qin Zhang, Lin Yan, Mu Qiao, Yonghui Wu, and Mingxuan Wang. Dapo: An open-source llm reinforcement learning system at scale, 2025. + +- Qingyang Zhang, Haitao Wu, Changqing Zhang, Peilin Zhao, and Yatao Bian. Right question is already half the answer: Fully unsupervised llm reasoning incentivization, 2025. URL https://arxiv.org/abs/2504.05812. +- Haizhong Zheng, Yang Zhou, Brian R. Bartoldson, Bhavya Kailkhura, Fan Lai, Jiawei Zhao, and Beidi Chen. Act only when it pays: Efficient reinforcement learning for llm reasoning via selective rollouts, 2025. URL https://arxiv.org/abs/2506.02177. +- Yinmin Zhong, Zili Zhang, Bingyang Wu, Shengyu Liu, Yukun Chen, Changyi Wan, Hanpeng Hu, Lei Xia, Ranchen Ming, Yibo Zhu, and Xin Jin. Optimizing rlhf training for large language models with stage fusion, 2025. URL https://arxiv.org/abs/2409.13221. + +# **APPENDIX** + +## A ADDITIONAL EXPERIMENTAL RESULTS + +To further validate the effectiveness of RL-ZVP on an alternative training dataset, we conduct experiments using a random 20k-prompt subset of NuminaMath-CoT (LI et al., 2024; Zhang et al., 2025), comparing RL-ZVP with GRPO under the same setup as our main experiments. + +The detailed evaluation results are shown in Table 4, and the corresponding average validation accuracy and training dynamics are provided in Figure 5. RL-ZVP consistently outperforms GRPO in all benchmarks, surpassing it throughout the training process. Moreover, RL-ZVP exhibits distinct training dynamics, characterized by lower entropy and longer response lengths. 
These findings confirm that RL-ZVP is effective not only across evaluation benchmarks but also across different training datasets, further strengthening LLM reasoning capabilities. + +Table 4: Additional evaluation results on six math reasoning benchmarks when the model is trained with NuminaMath-CoT. + +| Method | Minerva | | AMC23 | | MATH500 | | AIME24 | | AIME25 | | OlympiadBench | | +|----------------------------------------------------|---------|--------|-------|--------|---------|--------|--------|--------|--------|--------|---------------|--------| +| | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | Acc@8 | Pass@8 | +| Qwen3-1.7B-Base trained with NuminaMath-CoT subset | | | | | | | | | | | | | +| GRPO | 30.01 | 44.88 | 43.75 | 72.20 | 67.53 | 83.12 | 6.67 | 18.59 | 4.58 | 13.64 | 30.45 | 47.68 | +| RL-ZVP | 30.56 | 45.96 | 44.69 | 70.81 | 70.80 | 86.78 | 8.33 | 18.32 | 5.42 | 13.94 | 32.78 | 51.95 | +| $\Delta$ (w.r.t. GRPO) | +0.55 | +1.08 | +0.94 | -1.39 | +3.17 | +3.66 | +1.66 | -0.27 | +0.84 | +0.30 | +2.33 | +4.27 | +| Qwen3-8B-Base trained with NuminaMath-CoT subset | | | | | | | | | | | | | +| GRPO | 42.42 | 54.41 | 66.88 | 90.10 | 83.63 | 93.32 | 25.00 | 45.40 | 20.83 | 38.49 | 54.01 | 68.29 | +| RL-ZVP | 44.72 | 56.15 | 73.13 | 93.91 | 87.23 | 95.20 | 28.75 | 51.54 | 22.92 | 40.61 | 55.21 | 69.67 | +| $\Delta$ (w.r.t. GRPO) | +2.30 | +1.74 | +6.25 | +3.81 | +3.60 | +1.88 | +3.75 | +6.14 | +2.09 | +2.12 | +1.20 | +1.38 | + +Figure 5: Validation accuracy and training dynamics at different experiment scales. + +## B SUPPLEMENTARY FIGURES AND TABLES + +Figure 6: Average **accuracy** (a) and **pass rate** (b) on six math reasoning benchmarks across model scale and training methods. RL-ZVP consistently delivers the strongest performance among all baselines. + +Table 5: Number of rollouts and gradient steps of RL-ZVP compared to the baselines. 
For GRPO-DS-r and GRESO-r on the large scale, the policy initially solves only $\sim\!2\%$ of problems per training batch. This leads to extremely poor data efficiency, requiring over-sampling of $\sim\!50\times$ to form a full non-zero-variance batch for each gradient step. As a result, these methods achieve far fewer gradient updates than GRPO and RL-ZVP, despite using nearly $1.5\times$ more rollouts. This highlights the severe inefficiency of dynamic sampling on challenging training data. + +| Method | Small Scale | | | Large Scale | | | +|-----------|-------------|------------------|------------|-------------|------------------|------------| +| | #Rollouts | Rollouts (×GRPO) | Grad Steps | #Rollouts | Rollouts (×GRPO) | Grad Steps | +| GRPO | 172,032 | 1.00× | 672 | 540,672 | 1.00× | 1,056 | +| GRPO-DS-r | 172,032 | 1.00× | 320 | 827,392 | 1.53× | 32 | +| GRPO-DS-g | 421,888 | 2.45× | 672 | 2,859,008 | 5.29× | 1,056 | +| GRESO-r | 173,600 | 1.01× | 320 | 815,232 | 1.51× | 32 | +| GRESO-g | 272,030 | 1.58× | 672 | 2,159,104 | 3.99× | 1,056 | +| RL-ZVP | 172,032 | 1.00× | 672 | 540,672 | 1.00× | 1,056 | + +## C FULL IMPLEMENTATION DETAILS + +We present the full experimental setup details here. RL-ZVP and all baselines are implemented on top of the verl framework, ensuring that every experiment is conducted within the same pipeline. Training is performed on 4×RTX 3090 GPUs for Qwen3-1.7B-Base and 8×A100 GPUs for Qwen3-8B-Base. + +For evaluation, we initially attempted to measure validation results after every training iteration (16 gradient steps), but this proved extremely inefficient given the size of the validation set (1534 problems with 8 responses each). Instead, on the small scale, we evaluate the model performance on Minerva, AMC23, and MATH after every iteration and report the best result, while for AIME24, AIME25, and OlympiadBench we use the final checkpoint. On the large scale, thanks to greater training efficiency on more powerful GPUs, we evaluate Minerva, AMC23, MATH, AIME24, and AIME25 after every iteration and report the best results, while for OlympiadBench we use only the final checkpoint. + +The detailed hyperparameter settings are provided in Table 6. We mostly follow the standard hyperparameter settings from recent works (Yu et al., 2025; Cheng et al., 2025; Zheng et al., 2025). We use the clip-higher scheme with $\epsilon_{\rm high}=0.28$ and $\epsilon_{\rm low}=0.2$ , apply token-level loss, and exclude the KL divergence term from the objective. + +In addition to the common settings shared across methods, we also adopt the unique hyperparameter configurations of each baseline. 
For GRESO (Zheng et al., 2025), we follow their probabilistic filtering setup exactly: $\alpha_{\rm easy} = \frac{1}{12}$ , $\alpha_{\rm hard} = \frac{1}{6}$ , $p_{\rm easy} = 0.5$ , $p_{\rm hard} = 0.5$ , $\Delta p = 0.01$ , and $\beta = 1.25$ . For GRPO-DS, the only deviation from DAPO (Yu et al., 2025) is that we omit the *soft overlong punishment* to ensure a fair comparison with other methods, which also do not include this form of length-based reward shaping. + +Table 6: Full hyperparameter settings. + +| Hyperparameters | Qwen3-1.7B-Base + MATH | Qwen3-8B-Base + DAPO-Math-17k | +|------------------------|------------------------|-------------------------------| +| Training batch size | 512 | 512 | +| Mini batch size | 32 | 32 | +| Samples per prompt | 8 | 16 | +| Max response length | 4096 | 8192 | +| Clip ratio | (0.20, 0.28) | (0.20, 0.28) | +| Training temperature | 1.0 | 1.0 | +| Training top_p | 1.0 | 1.0 | +| Validation temperature | 1.0 | 1.0 | +| Validation top_p | 0.7 | 0.7 | +| Number of epochs | 3 | 2 | +| Total gradient steps | 672 | 1056 | +| Optimizer | AdamW | AdamW | +| Learning rate | $3 \times 10^{-6}$ | $1 \times 10^{-6}$ | +| LR warmup steps | 4 | 10 | +| LR scheduler | cosine | constant | + +## D QUALITATIVE EXAMPLES + +This appendix presents example outputs generated by policy models trained with GRPO and RL-ZVP. + +### **Minerva: Question and Ground Truth** + +**Question:** A particular star has an absolute magnitude M=-7. If this star is observed in a galaxy that is at a distance of $3 \mathrm{Mpc}$ , what will its apparent magnitude be? 
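For reference, the expected value can be reproduced directly from the standard distance modulus, $m - M = 5\log_{10}(d_{\rm pc}) - 5$ (a minimal numerical check, independent of the model transcripts below):

```python
import math

M = -7      # absolute magnitude (given)
d_pc = 3e6  # 3 Mpc expressed in parsecs

# Distance modulus: m - M = 5*log10(d_pc) - 5
m = M + 5 * math.log10(d_pc) - 5
print(round(m, 2))  # → 20.39
```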
+ +**Ground Truth:** 20.39 + +### Minerva: Response from GRPO (Qwen3-1.7B-Base) + +To determine the apparent magnitude of a star with an absolute magnitude of M=-7 that is observed at a distance of $3 \mathrm{Mpc}$ , we can use the distance modulus formula: + +$$m - M = 5\log_{10}(d) + 5$$ + +where: + +- m is the apparent magnitude, +- M is the absolute magnitude, +- d is the distance to the star in parsecs. + +Given: + +- M = -7 +- d = 3 Mpc + +First, we need to convert the distance from Mpc to parsecs. Since $1 \rm Mpc$ = $3.2616 \times 10^{18}$ parsecs, we have: + +$$d = 3 \times 3.2616 \times 10^{18} \text{ parsecs}$$ + +Now, we can plug the values into the distance modulus formula: + +$$m - (-7) = 5 \log_{10}(3 \times 3.2616 \times 10^{18})$$ + +Simplify the expression inside the logarithm: + +$$3 \times 3.2616 \times 10^{18} = 9.7848 \times 10^{18}$$ + +Now, calculate the logarithm: + +$$\log_{10}(9.7848 \times 10^{18}) = \log_{10}(9.7848) + \log_{10}(10^{18}) \approx 0.9908 + 18 = 18.9908$$ + +Next, multiply by 5: + +$$5 \times 18.9908 = 94.954$$ + +Now, add 5 to the result: + +$$m+7=94.954 \implies m=94.954-7=87.954$$ + +So, the apparent magnitude m of the star is approximately 87.954. Therefore, the final answer is: + +87.95 + +### Minerva: Response from RL-ZVP (Qwen3-1.7B-Base) + +Alright, let's solve the problem step by step. + +**Problem Statement:** + +We have a star with an absolute magnitude M=-7. This star is observed in a galaxy that is at a distance of $3 \mathrm{Mpc}$ . We need to find its apparent magnitude. + +**Understanding the Concepts:** + +1. **Absolute Magnitude (M):** This is the brightness of a star if it were placed at a standard distance of 10 parsecs (pc) from Earth. It gives us a measure of the star's intrinsic brightness. +2. **Apparent Magnitude (m):** This is the brightness of a star as seen from Earth. 
It depends on both the star's intrinsic brightness (absolute magnitude) and the distance to the star. +3. **Distance Modulus Formula:** The relationship between absolute magnitude (M), apparent magnitude (m), and distance (d) in parsecs is given by: + +$$m - M = 5\log_{10}(d) - 5$$ + +where d is the distance to the star in parsecs. + +**Given:** +- Absolute magnitude M=-7 +- Distance $d = 3 \mathrm{Mpc}$ + +**Step 1: Convert the Distance to Parsecs** + +First, we need to convert the distance from megaparsecs (Mpc) to parsecs (pc) because the distance modulus formula uses parsecs. + +$$1 \text{Mpc} = 10^6 \text{ pc}$$ + +So, + +$$d = 3\,\mathrm{Mpc} = 3 \times 10^{6}\,\mathrm{pc}$$ + +**Step 2: Plug the Values into the Distance Modulus Formula** + +Using the formula: + +$$m - M = 5\log_{10}(d) - 5$$ + +Substitute M = -7 and $d = 3 \times 10^6$ : + +$$m - (-7) = 5\log_{10}(3 \times 10^6) - 5$$ + +$$m + 7 = 5\log_{10}(3) + 5\log_{10}(10^6) - 5$$ + +**Step 3: Simplify the Logarithms** + +*(...)* + +**Step 5: Substitute and Solve for m** + +Now, substitute $\log_{10}(3)$ into the equation: + +$$m+7 = 5 \times 0.4771 + 25$$ + +$$m+7 = 2.3855 + 25$$ + +$$m+7 = 27.3855$$ + +$$m = 27.3855 - 7$$ + +$$m = 20.3855$$ + +**Final Answer:** + +The apparent magnitude of the star is approximately $\boxed{20.39}$ + +### **AIME24: Question and Ground Truth** + +**Question:** Let $\mathcal{B}$ be the set of rectangular boxes with surface area 54 and volume 23. Let r be the radius of the smallest sphere that can contain each of the rectangular boxes that are elements of $\mathcal{B}$ . The value of $r^2$ can be written as $\frac{p}{q}$ , where p and q are relatively prime positive integers. Find p+q. + +**Ground Truth:** 721 + +### AIME24: Response from GRPO (Qwen3-1.7B-Base) + +To solve the problem, we need to find the radius r of the smallest sphere that can contain each of the rectangular boxes with surface area 54 and volume 23. 
The radius r of the smallest sphere that can contain a rectangular box is the maximum distance from the center of the box to any of its vertices. + +Let's denote the dimensions of the rectangular box by a, b, and c. The volume of the box is given by: + +(...) + +We need to find the roots of this polynomial. By trial and error or using numerical methods, we can find that one possible set of roots is a=1, b=1, and c=23. Let's check if these values satisfy the equations: + +$$abc = 1 \cdot 1 \cdot 23 = 23$$ + +$$ab + bc + ca = 1 \cdot 1 + 1 \cdot 23 + 23 \cdot 1 = 1 + 23 + 23 = 47 \neq 27$$ + +This set of values does not work. Let's try another set of values. Suppose a=1, b=1, and c=23 is not the solution, let's try another approach. We can use the fact that the box is symmetric and assume a=b. Let's assume a=b and solve for c: + +$$a^{2} c = 23$$ +$$2a^{2} + 2a c = 54$$ + +From the first equation: + +$$c = \frac{23}{a^2}$$ + +Substitute into the second equation: + +$$2a^2 + 2a\left(\frac{23}{a^2}\right) = 54$$ + +$$2a^2 + \frac{46}{a} = 54$$ + +Multiply through by *a*: + +$$2a^3 + 46 = 54a$$ + +$$2a^3 - 54a + 46 = 0$$ + +**By trial and error**, we find that a = 2 is a root: + +*(...)* + +The answer is $\boxed{673}$ + +### AIME24: Response from RL-ZVP (Qwen3-1.7B-Base) + +Alright, I have this problem about rectangular boxes and spheres. Let's break it down step by step. + +**Problem Statement:** + +We have a set of rectangular boxes with: + +1. Surface area = 54 +2. Volume = 23 + +Let $\mathcal B$ be the set of all such boxes. We need to find the radius r of the smallest sphere that can contain each of these boxes. Finally, we need to express $r^2$ as a fraction $\frac{p}{q}$ where p and q are coprime positive integers, and then find p+q. + +**Understanding the Problem:** + +First, let's understand what's given: + +- A rectangular box has dimensions a, b, and c. 
+- The surface area S of a rectangular box is given by: + +$$S = 2(ab + bc + ca) = 54$$ + +- The volume V of the box is: + +$$V = abc = 23$$ + +- We need to find the smallest sphere that can contain any box in $\mathcal{B}$ . The radius r of this sphere will be the maximum distance from the center of the sphere to any point on the box. So, r is half the length of the space diagonal of the box. + +**Finding the Space Diagonal:** + +The space diagonal d of a rectangular box is given by: + +$$d = \sqrt{a^2 + b^2 + c^2}$$ + +(...) + +We need to maximize $a^2+b^2+c^2$ . Let's see if we can express $a^2+b^2+c^2$ in terms of ab+bc+ca and abc. + +Recall the identity: + +$$(a+b+c)^2 = a^2 + b^2 + c^2 + 2(ab+bc+ca)$$ + +But we don't know a+b+c directly. Alternatively, we can consider using the method of Lagrange multipliers or express two variables in terms of the third, but that might get complicated. + +Wait, perhaps we can assume symmetry to simplify the problem. Let's assume that two of the dimensions are equal, and the third is different. Let's set a = b, and express c in terms of a. + +(...) + +We need to find $r^2$ : + +$$r^2 = \left(\frac{3\sqrt{73}}{8}\right)^2 = \frac{9\times73}{64} = \frac{657}{64}$$ + +Here, p = 657 and q = 64, which are coprime. + +**Final Answer:** + +$$p + q = 657 + 64 = 721$$ + +721 + +### **AIME25: Question and Ground Truth** + +**Question:** Let $A_1A_2\ldots A_{11}$ be an 11-sided non-convex simple polygon with the following properties: \* The area of $A_iA_1A_{i+1}$ is 1 for each $2\leq i\leq 10$ , \* $\cos(\angle A_iA_1A_{i+1})=\frac{12}{13}$ for each $2\leq i\leq 10$ , \* The perimeter of $A_1A_2\ldots A_{11}$ is 20. If $A_1A_2+A_1A_{11}$ can be expressed as $\frac{m\sqrt{n}-p}{q}$ for positive integers m,n,p,q with n squarefree and no prime divides all of m,p,q, find m+n+p+q. + +**Ground Truth:** 19 + +### AIME25: Response from GRPO (Qwen3-8B-Base) + +### Step-by-Step Solution: + +1. **Understanding the Given Information:** +- We have an 11-sided non-convex simple polygon $A_1 A_2 \dots A_{11}$ . +- The area of triangle $A_i A_1 A_{i+1}$ is 1 for each $2 \le i \le 10$ . +- The cosine of angle $\angle A_i A_1 A_{i+1}$ is $\frac{12}{13}$ for each $2 \le i \le 10$ . +- The perimeter of the polygon is 20. +- We need to find $A_1A_2 + A_1A_{11}$ expressed in the form $\frac{m\sqrt{n}-p}{q}$ and compute m+n+p+q. +2. **Analyzing the Triangles:** +- For each triangle $A_i A_1 A_{i+1}$ , we know: +- Area = 1 +- $\cos(\theta_i) = \frac{12}{13}$ , where $\theta_i = \angle A_i A_1 A_{i+1}$ . + +(...) + +- However, this approach seems too complex. Let's try a different method. +8. **Using Trigonometry and Perimeter:** +- Let's denote $x = A_1 A_2$ and $y = A_1 A_{11}$ . +- The perimeter equation: + +$$x + \sum_{i=3}^{10} A_1 A_i + y + A_{11} A_1 = 20$$ + +(...) + +- This seems too complicated. Let's try a different approach. +10. **Final Calculation:** Let's assume $A_1A_i = a$ for $i = 2, 3, \ldots, 10$ . + +(...) + +- This approach is too complex. Let's try a different method. +11. **Simpler Approach:** Let's denote $S = A_1A_2 + A_1A_{11}$ . + +(...) + +- Therefore: + +$$m + n + p + q = 80 + 26 + 260 + 19 = 385$$ + +### Final Answer: 385 + +### AIME25: Response from RL-ZVP (Qwen3-8B-Base) + +Alright, I have this problem about an 11-sided non-convex simple polygon with some specific properties. I need to find the value of $A_1A_2 + A_1A_{11}$ expressed in a certain form and then compute m + n + p + q. Let's break it down step by step. + +**Given:** + +1. $A_1 A_2 \dots A_{11}$ is an 11-sided non-convex simple polygon. + +2. The area of triangle $A_i A_1 A_{i+1}$ is 1 for each $2 \le i \le 10$ . + +3. $\cos(\angle A_i A_1 A_{i+1}) = \frac{12}{13}$ for each $2 \le i \le 10$ . + +4. The perimeter of the polygon is 20. 
+ +**Goal:** Find $A_1A_2 + A_1A_{11}$ expressed as $\frac{m\sqrt{n}-p}{q}$ and then compute m+n+p+q. Alright, let's tackle this step by step. + +### Step 1: Understanding the Triangle Areas and Angles + +First, we have triangles $A_iA_1A_{i+1}$ for i=2 to 10. The area of each of these triangles is 1, and we know the cosine of the angle at $A_1$ in each triangle is $\frac{12}{13}$ . + +(...) + +But from the triangles we have relationships between $x_i$ and $y_i$ . Let's see how $x_i$ and $y_i$ relate to the sides of the polygon. + +Wait, perhaps it's better to think in terms of vectors or use the Law of Cosines. Alternatively, perhaps we can express everything in terms of $A_1A_2$ and $A_1A_{11}$ . + +### Step 3: Expressing the Polygon in Terms of Vectors + +(...) + +Wait, perhaps we can find $x_i + y_i$ or another relationship. Alternatively, perhaps we can sum up all the triangles. Wait, let's think differently. + +### Step 4: Summing the Areas + +(...) + +But how do these sides relate to x and y? + +Wait, perhaps we can express each side $A_iA_{i+1}$ in terms of x and y. Let's see. + +(...) + +Wait, perhaps we can express a and k in terms of x and y. Alternatively, perhaps we can find a+k directly. Let's think differently. + +### Step 8: Finding a + k + +(...) + +Thus, + +$$m+n+p+q=9+5+1+4=19$$ + +**Final Answer:** 19 + +### **MATH500: Question and Ground Truth** + +**Question:** If $\omega^{1997} = 1$ and $\omega \neq 1$ , then evaluate + +$$\frac{1}{1+\omega} + \frac{1}{1+\omega^2} + \dots + \frac{1}{1+\omega^{1997}}.$$ + +**Ground Truth:** $\frac{1997}{2}$ + +### MATH500: Response from GRPO (Qwen3-8B-Base) + +### Step-by-Step Solution: + +1. **Understanding the Problem:** +- We are given that $\omega^{1997}=1$ and $\omega\neq 1$ . This means $\omega$ is a primitive 1997th root of unity. +- We need to evaluate the sum: + +$$S = \sum_{k=1}^{1997} \frac{1}{1 + \omega^k}$$ + +2. **Simplify the Sum:** +- Notice that $\omega^{1997}=1$ implies $\omega^k$ for $k=1,2,\ldots,1996$ are all distinct roots of unity, and $\omega^{1997}=1$ repeats the root 1. - However, since $\omega\neq 1$ , $\omega^{1997}=1$ is already included in the sum. +3. **Pairing Terms:** +- Consider the sum: + +$$S = \sum_{k=1}^{1997} \frac{1}{1 + \omega^k}$$ + +- Notice that $\omega^{1997-k}=\omega^{-k}$ (since $\omega^{1997}=1$ ). The term $\frac{1}{1+\omega^k}$ can be paired with $\frac{1}{1+\omega^{1997-k}}$ : + +$$\frac{1}{1+\omega^k} + \frac{1}{1+\omega^{1997-k}} = \frac{1}{1+\omega^k} + \frac{1}{1+\omega^{-k}}$$ + +- Simplify the pair: + +$$\frac{1}{1+\omega^k} + \frac{1}{1+\omega^{-k}} = \frac{(1+\omega^{-k}) + (1+\omega^k)}{(1+\omega^k)(1+\omega^{-k})} = \frac{2+\omega^k + \omega^{-k}}{1+\omega^k + \omega^{-k} + 1} = \frac{2+\omega^k + \omega^{-k}}{2+\omega^k + \omega^{-k}} = 1$$ + +- Therefore, each pair sums to 1. +4. **Counting the Pairs:** +- Since there are 1997 terms in the sum, and each pair consists of 2 terms, there are $\frac{1997-1}{2} = 998$ complete pairs. +- The remaining term is when k = 999, but since 1997 is odd, there is no unpaired term. +5. **Calculating the Total Sum:** +- The sum of 998 pairs, each summing to 1, is: + +$$S = 998 \times 1 = 998$$ + +### Final Answer: 998 + +### MATH500: Response from RL-ZVP (Qwen3-8B-Base) + +Alright, I have this problem to solve: + +**Problem:** + +If $\omega^{1997}=1$ and $\omega\neq 1$ , then evaluate + +$$\frac{1}{1+\omega} + \frac{1}{1+\omega^2} + \dots + \frac{1}{1+\omega^{1997}}.$$ + +Alright, let's break this down. + +### Understanding the Problem + +(...) + +So, the total sum S is: + +$$S = 998 \times 1 + \frac{1}{2} = 998 + \frac{1}{2} = \frac{1996}{2} + \frac{1}{2} = \frac{1997}{2}$$ + +Wait, that's $\frac{1997}{2}$ . + +Wait, that's $\frac{1997}{2}$ . But let's double-check this. + +### Double-Checking the Calculation + +(...) 
+ +Let's multiply S' by $(1 - \omega)$ : + +$$(1 - \omega)S' = \sum_{k=1}^{1996} \frac{1 - \omega}{1 + \omega^k}$$ + +This seems complicated. Maybe there's a better approach. + +### Using the Identity for the Sum + +I recall that for any primitive *n*-th root of unity $\omega$ , the sum $\sum_{k=1}^{n-1} \frac{1}{1+\omega^k} = \frac{n-1}{2}$ . + +(...) + +Final Answer: + +But since the problem likely expects an integer, perhaps I missed something. Wait, let's think differently. + +### Final Verification + +Let's consider the sum: + +$$S = \sum_{k=1}^{1997} \frac{1}{1 + \omega^k}$$ + +(...) + +Therefore, the correct answer is indeed $\frac{1997}{2}$ . + +Final Answer: \ No newline at end of file diff --git a/research/papers/arXiv-rl_zvp/arXiv-2509.21880_meta.json b/research/papers/arXiv-rl_zvp/arXiv-2509.21880_meta.json new file mode 100644 index 0000000..a2b19e8 --- /dev/null +++ b/research/papers/arXiv-rl_zvp/arXiv-2509.21880_meta.json @@ -0,0 +1,2167 @@ +{ + "table_of_contents": [ + { + "title": "ABSTRACT", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 276.71484375, + 230.09765625 + ], + [ + 334.5, + 230.09765625 + ], + [ + 334.5, + 239.25 + ], + [ + 276.71484375, + 239.25 + ] + ] + }, + { + "title": "1 Introduction", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 109.072265625, + 667.08984375 + ], + [ + 206.25, + 667.08984375 + ], + [ + 206.25, + 677.25 + ], + [ + 109.072265625, + 677.25 + ] + ] + }, + { + "title": "2 Preliminary: Group Relative Policy Optimization", + "heading_level": null, + "page_id": 1, + "polygon": [ + [ + 107.279296875, + 616.81640625 + ], + [ + 419.25, + 616.81640625 + ], + [ + 419.25, + 627.0 + ], + [ + 107.279296875, + 627.0 + ] + ] + }, + { + "title": "3 REINFORCEMENT LEARNING WITH ZERO-VARIANCE PROMPTS", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 107.876953125, + 259.875 + ], + [ + 446.25, + 259.875 + ], + [ + 446.25, + 270.0 + ], + [ + 107.876953125, + 270.0 + ] + ] + }, + { + 
"title": "3.1 MOTIVATION: ADVANTAGE VANISHING IN ZERO-VARIANCE PROMPTS", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 106.5, + 341.47265625 + ], + [ + 428.25, + 341.47265625 + ], + [ + 428.25, + 351.0 + ], + [ + 106.5, + 351.0 + ] + ] + }, + { + "title": "3.2 ADVANTAGE FORMULATION", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 106.5, + 211.921875 + ], + [ + 251.25, + 211.921875 + ], + [ + 251.25, + 220.5 + ], + [ + 106.5, + 220.5 + ] + ] + }, + { + "title": "Algorithm 1: Reinforcement Learning with Zero-Variance Prompts", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 106.3828125, + 86.23828125 + ], + [ + 378.75, + 86.23828125 + ], + [ + 378.75, + 96.29296875 + ], + [ + 106.3828125, + 96.29296875 + ] + ] + }, + { + "title": "3.3 FINAL OBJECTIVE", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 106.5, + 303.1875 + ], + [ + 210.0, + 303.1875 + ], + [ + 210.0, + 312.0 + ], + [ + 106.5, + 312.0 + ] + ] + }, + { + "title": "4 EXPERIMENTS", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 107.279296875, + 548.75390625 + ], + [ + 200.25, + 548.75390625 + ], + [ + 200.25, + 559.5 + ], + [ + 107.279296875, + 559.5 + ] + ] + }, + { + "title": "4.1 EXPERIMENTAL SETTINGS", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 106.5, + 633.4453125 + ], + [ + 245.25, + 633.4453125 + ], + [ + 245.25, + 643.5 + ], + [ + 106.5, + 643.5 + ] + ] + }, + { + "title": "4.2 RESULTS & ANALYSIS", + "heading_level": null, + "page_id": 5, + "polygon": [ + [ + 106.5, + 305.5078125 + ], + [ + 228.75, + 305.5078125 + ], + [ + 228.75, + 315.0 + ], + [ + 106.5, + 315.0 + ] + ] + }, + { + "title": "4.3 ABLATION STUDY", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ + 106.5, + 405.0 + ], + [ + 210.0, + 405.0 + ], + [ + 210.0, + 414.17578125 + ], + [ + 106.5, + 414.17578125 + ] + ] + }, + { + "title": "5 RELATED WORK", + "heading_level": null, + "page_id": 8, + "polygon": [ + [ + 107.876953125, + 
282.69140625 + ], + [ + 212.25, + 282.69140625 + ], + [ + 212.25, + 292.5 + ], + [ + 107.876953125, + 292.5 + ] + ] + }, + { + "title": "6 CLOSING REMARKS", + "heading_level": null, + "page_id": 8, + "polygon": [ + [ + 108.474609375, + 515.49609375 + ], + [ + 229.5, + 515.49609375 + ], + [ + 229.5, + 525.75 + ], + [ + 108.474609375, + 525.75 + ] + ] + }, + { + "title": "ACKNOWLEDGEMENT", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 107.25, + 83.91796875 + ], + [ + 201.0, + 83.91796875 + ], + [ + 201.0, + 92.25 + ], + [ + 107.25, + 92.25 + ] + ] + }, + { + "title": "REFERENCES", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 107.25, + 182.25 + ], + [ + 176.25, + 182.25 + ], + [ + 176.25, + 192.0 + ], + [ + 107.25, + 192.0 + ] + ] + }, + { + "title": "APPENDIX", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 107.25, + 81.75 + ], + [ + 173.25, + 81.75 + ], + [ + 173.25, + 91.5 + ], + [ + 107.25, + 91.5 + ] + ] + }, + { + "title": "A ADDITIONAL EXPERIMENTAL RESULTS", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 107.876953125, + 108.75 + ], + [ + 330.0, + 108.75 + ], + [ + 330.0, + 119.25 + ], + [ + 107.876953125, + 119.25 + ] + ] + }, + { + "title": "B SUPPLEMENTARY FIGURES AND TABLES", + "heading_level": null, + "page_id": 15, + "polygon": [ + [ + 106.681640625, + 80.82421875 + ], + [ + 335.25, + 80.82421875 + ], + [ + 335.25, + 92.25 + ], + [ + 106.681640625, + 92.25 + ] + ] + }, + { + "title": "C FULL IMPLEMENTATION DETAILS", + "heading_level": null, + "page_id": 16, + "polygon": [ + [ + 106.98046875, + 81.2109375 + ], + [ + 300.0, + 81.2109375 + ], + [ + 300.0, + 92.25 + ], + [ + 106.98046875, + 91.5 + ] + ] + }, + { + "title": "D QUALITIVATE EXAMPLES", + "heading_level": null, + "page_id": 17, + "polygon": [ + [ + 106.98046875, + 80.82421875 + ], + [ + 258.75, + 80.82421875 + ], + [ + 258.75, + 93.0 + ], + [ + 106.98046875, + 93.0 + ] + ] + }, + { + "title": "Minerva: Question and Ground Truth", + 
"heading_level": null, + "page_id": 17, + "polygon": [ + [ + 117.75, + 139.60546875 + ], + [ + 283.5, + 139.60546875 + ], + [ + 283.5, + 149.66015625 + ], + [ + 117.75, + 149.66015625 + ] + ] + }, + { + "title": "Minerva: Response from GRPO (Qwen3-1.7B-Base)", + "heading_level": null, + "page_id": 17, + "polygon": [ + [ + 117.75, + 229.32421875 + ], + [ + 342.0, + 229.32421875 + ], + [ + 342.0, + 240.0 + ], + [ + 117.75, + 240.0 + ] + ] + }, + { + "title": "Minerva: Response from RL-ZVP (Qwen3-1.7B-Base)", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 117.73828125, + 104.25 + ], + [ + 351.0, + 104.25 + ], + [ + 351.0, + 114.0 + ], + [ + 117.73828125, + 114.0 + ] + ] + }, + { + "title": "Alright, let's solve the problem step by step.", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 117.75, + 127.5 + ], + [ + 296.25, + 127.5 + ], + [ + 296.25, + 137.25 + ], + [ + 117.75, + 137.25 + ] + ] + }, + { + "title": "**Problem Statement:**", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 118.634765625, + 148.88671875 + ], + [ + 222.0, + 148.5 + ], + [ + 222.0, + 157.5 + ], + [ + 118.634765625, + 158.25 + ] + ] + }, + { + "title": "**Understanding the Concepts:**", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 118.634765625, + 192.75 + ], + [ + 261.0, + 192.75 + ], + [ + 261.0, + 203.25 + ], + [ + 118.634765625, + 203.25 + ] + ] + }, + { + "title": "**Step 1: Convert the Distance to Parsecs**", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 117.73828125, + 360.03515625 + ], + [ + 302.25, + 360.03515625 + ], + [ + 302.25, + 370.5 + ], + [ + 117.73828125, + 370.5 + ] + ] + }, + { + "title": "**Step 2: Plug the Values into the Distance Modulus Formula**", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 118.634765625, + 430.41796875 + ], + [ + 385.5, + 429.0 + ], + [ + 385.5, + 438.92578125 + ], + [ + 118.634765625, + 440.25 + ] + ] + }, + { + "title": "**Step 5: Substitute and Solve 
for m**", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 118.93359375, + 563.0625 + ], + [ + 282.0, + 563.0625 + ], + [ + 282.0, + 572.34375 + ], + [ + 118.93359375, + 572.34375 + ] + ] + }, + { + "title": "AIME24: Question and Ground Truth", + "heading_level": null, + "page_id": 19, + "polygon": [ + [ + 117.439453125, + 83.53125 + ], + [ + 283.5, + 83.53125 + ], + [ + 283.5, + 93.0 + ], + [ + 117.439453125, + 93.0 + ] + ] + }, + { + "title": "AIME24: Response from GRPO (Qwen3-1.7B-Base)", + "heading_level": null, + "page_id": 19, + "polygon": [ + [ + 117.75, + 196.83984375 + ], + [ + 342.75, + 196.83984375 + ], + [ + 342.75, + 207.0 + ], + [ + 117.75, + 207.0 + ] + ] + }, + { + "title": "AIME24: Response from RL-ZVP (Qwen3-1.7B-Base)", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 117.73828125, + 88.55859375 + ], + [ + 351.0, + 88.55859375 + ], + [ + 351.0, + 98.61328125 + ], + [ + 117.73828125, + 98.61328125 + ] + ] + }, + { + "title": "**Problem Statement:**", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 118.634765625, + 145.01953125 + ], + [ + 222.0, + 145.01953125 + ], + [ + 222.0, + 153.75 + ], + [ + 118.634765625, + 153.75 + ] + ] + }, + { + "title": "**Understanding the Problem:**", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 119.25, + 234.3515625 + ], + [ + 257.25, + 234.3515625 + ], + [ + 257.25, + 243.75 + ], + [ + 119.25, + 243.75 + ] + ] + }, + { + "title": "**Finding the Space Diagonal:**", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 118.037109375, + 373.5 + ], + [ + 259.5, + 372.75 + ], + [ + 259.5, + 382.8515625 + ], + [ + 118.037109375, + 384.0 + ] + ] + }, + { + "title": "AIME25: Ouestion and Ground Truth", + "heading_level": null, + "page_id": 21, + "polygon": [ + [ + 117.75, + 87.0 + ], + [ + 283.5, + 87.0 + ], + [ + 283.5, + 97.06640625 + ], + [ + 117.75, + 97.06640625 + ] + ] + }, + { + "title": "AIME25: Response from GRPO (Qwen3-8B-Base)", + 
"heading_level": null, + "page_id": 21, + "polygon": [ + [ + 117.75, + 213.46875 + ], + [ + 335.25, + 213.46875 + ], + [ + 335.25, + 224.296875 + ], + [ + 117.75, + 224.296875 + ] + ] + }, + { + "title": "AIME25: Response from RL-ZVP (Qwen3-8B-Base)", + "heading_level": null, + "page_id": 22, + "polygon": [ + [ + 117.75, + 92.0390625 + ], + [ + 343.5, + 92.0390625 + ], + [ + 343.5, + 101.3203125 + ], + [ + 117.75, + 101.3203125 + ] + ] + }, + { + "title": "MATH500: Question and Ground Truth", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 117.439453125, + 83.14453125 + ], + [ + 291.75, + 83.14453125 + ], + [ + 291.75, + 93.19921875 + ], + [ + 117.439453125, + 93.19921875 + ] + ] + }, + { + "title": "MATH500: Response from GRPO (Qwen3-8B-Base)", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 117.75, + 201.48046875 + ], + [ + 342.75, + 201.48046875 + ], + [ + 342.75, + 213.08203125 + ], + [ + 117.75, + 213.08203125 + ] + ] + }, + { + "title": "MATH500: Response from RL-ZVP (Qwen3-8B-Base)", + "heading_level": null, + "page_id": 24, + "polygon": [ + [ + 117.73828125, + 88.171875 + ], + [ + 351.75, + 88.171875 + ], + [ + 351.75, + 99.0 + ], + [ + 117.73828125, + 99.0 + ] + ] + } + ], + "page_stats": [ + { + "page_id": 0, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 57 + ], + [ + "Span", + 21 + ], + [ + "Text", + 4 + ], + [ + "PageHeader", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 8617, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 1, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 72 + ], + [ + "Span", + 51 + ], + [ + "Text", + 7 + ], + [ + "PageHeader", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", 
+ 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 897, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 2, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 72 + ], + [ + "Span", + 33 + ], + [ + "Text", + 9 + ], + [ + "Equation", + 3 + ], + [ + "SectionHeader", + 2 + ], + [ + "Footnote", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + "llm_tokens_used": 2895, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 3, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 56 + ], + [ + "Span", + 51 + ], + [ + "Text", + 7 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "Equation", + 1 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1459, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 4, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 62 + ], + [ + "Line", + 48 + ], + [ + "Text", + 6 + ], + [ + "SectionHeader", + 4 + ], + [ + "Equation", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "Code", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 2569, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 5, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 428 + ], + [ + "Line", + 46 + ], + [ + "Span", + 29 + ], + [ + "Text", + 5 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 
1 + ], + [ + "Caption", + 1 + ], + [ + "Table", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 6558, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 6, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 74 + ], + [ + "Span", + 13 + ], + [ + "Text", + 4 + ], + [ + "PageHeader", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 7, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 102 + ], + [ + "Line", + 55 + ], + [ + "Span", + 16 + ], + [ + "Text", + 6 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Table", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 2721, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 8, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 111 + ], + [ + "Line", + 54 + ], + [ + "Span", + 34 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "Table", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 3340, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 9, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 64 + ], + [ + "Span", + 44 + ], + [ + "Text", + 8 + ], + [ + "SectionHeader", + 2 + ], + [ + 
"PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 10, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 67 + ], + [ + "Span", + 19 + ], + [ + "ListItem", + 13 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 11, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 61 + ], + [ + "Span", + 51 + ], + [ + "Text", + 5 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 12, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 63 + ], + [ + "Span", + 19 + ], + [ + "ListItem", + 9 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 13, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 14 + ], + [ + "Span", + 5 + ], + [ + "ListItem", + 3 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 14, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 254 + ], + [ + "Line", + 31 + ], + [ + 
"Span", + 12 + ], + [ + "Text", + 6 + ], + [ + "SectionHeader", + 2 + ], + [ + "Caption", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "Table", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 4725, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 15, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 105 + ], + [ + "Line", + 44 + ], + [ + "Span", + 15 + ], + [ + "Caption", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Table", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 2528, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 16, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 48 + ], + [ + "Line", + 31 + ], + [ + "Span", + 26 + ], + [ + "Text", + 4 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Table", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1700, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 17, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 57 + ], + [ + "Span", + 41 + ], + [ + "Text", + 14 + ], + [ + "Equation", + 7 + ], + [ + "ListItem", + 5 + ], + [ + "SectionHeader", + 3 + ], + [ + "ListGroup", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 7, + "llm_error_count": 0, + "llm_tokens_used": 5082, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 18, 
+ "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 66 + ], + [ + "Span", + 51 + ], + [ + "Text", + 11 + ], + [ + "SectionHeader", + 7 + ], + [ + "Equation", + 7 + ], + [ + "ListItem", + 6 + ], + [ + "ListGroup", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 7, + "llm_error_count": 0, + "llm_tokens_used": 5032, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 19, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 71 + ], + [ + "Line", + 52 + ], + [ + "Text", + 13 + ], + [ + "Equation", + 7 + ], + [ + "SectionHeader", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 7, + "llm_error_count": 0, + "llm_tokens_used": 5019, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 20, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 83 + ], + [ + "Line", + 67 + ], + [ + "Text", + 17 + ], + [ + "Equation", + 6 + ], + [ + "SectionHeader", + 4 + ], + [ + "ListItem", + 4 + ], + [ + "ListGroup", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 6, + "llm_error_count": 0, + "llm_tokens_used": 4200, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 21, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 90 + ], + [ + "Line", + 66 + ], + [ + "ListItem", + 19 + ], + [ + "Text", + 9 + ], + [ + "ListGroup", + 4 + ], + [ + "SectionHeader", + 2 + ], + [ + "Equation", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 1374, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 22, + "text_extraction_method": "surya", + 
"block_counts": [ + [ + "Span", + 91 + ], + [ + "Line", + 66 + ], + [ + "Text", + 24 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "Code", + 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 714, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 23, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 61 + ], + [ + "Line", + 31 + ], + [ + "ListItem", + 17 + ], + [ + "Equation", + 6 + ], + [ + "Text", + 5 + ], + [ + "ListGroup", + 4 + ], + [ + "SectionHeader", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 6, + "llm_error_count": 0, + "llm_tokens_used": 4388, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 24, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 45 + ], + [ + "Line", + 29 + ], + [ + "Text", + 23 + ], + [ + "Equation", + 4 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 6, + "llm_error_count": 0, + "llm_tokens_used": 3671, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + } + ], + "debug_data_path": "debug_data/arXiv:2509.21880" +} \ No newline at end of file diff --git a/research/papers/arXiv-scalerl/_page_0_Figure_7.jpeg b/research/papers/arXiv-scalerl/_page_0_Figure_7.jpeg new file mode 100644 index 0000000..fec2699 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_0_Figure_7.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_10_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_10_Figure_0.jpeg new file mode 100644 index 0000000..6e194ef Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_10_Figure_0.jpeg differ diff --git 
a/research/papers/arXiv-scalerl/_page_10_Figure_2.jpeg b/research/papers/arXiv-scalerl/_page_10_Figure_2.jpeg new file mode 100644 index 0000000..32e51df Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_10_Figure_2.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_11_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_11_Figure_0.jpeg new file mode 100644 index 0000000..0b43341 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_11_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_21_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_21_Figure_0.jpeg new file mode 100644 index 0000000..1f2f0e6 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_21_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_21_Figure_2.jpeg b/research/papers/arXiv-scalerl/_page_21_Figure_2.jpeg new file mode 100644 index 0000000..d342afd Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_21_Figure_2.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_22_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_22_Figure_0.jpeg new file mode 100644 index 0000000..38e6f84 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_22_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_22_Figure_2.jpeg b/research/papers/arXiv-scalerl/_page_22_Figure_2.jpeg new file mode 100644 index 0000000..283aed5 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_22_Figure_2.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_23_Figure_3.jpeg b/research/papers/arXiv-scalerl/_page_23_Figure_3.jpeg new file mode 100644 index 0000000..4dd743a Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_23_Figure_3.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_24_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_24_Figure_0.jpeg new file mode 100644 index 0000000..dac2cb9 Binary files /dev/null and 
b/research/papers/arXiv-scalerl/_page_24_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_25_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_25_Figure_0.jpeg new file mode 100644 index 0000000..c169225 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_25_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_25_Figure_1.jpeg b/research/papers/arXiv-scalerl/_page_25_Figure_1.jpeg new file mode 100644 index 0000000..ad9c5fe Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_25_Figure_1.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_26_Figure_2.jpeg b/research/papers/arXiv-scalerl/_page_26_Figure_2.jpeg new file mode 100644 index 0000000..8b7f659 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_26_Figure_2.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_27_Figure_1.jpeg b/research/papers/arXiv-scalerl/_page_27_Figure_1.jpeg new file mode 100644 index 0000000..6f6e9e8 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_27_Figure_1.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_2_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_2_Figure_0.jpeg new file mode 100644 index 0000000..31db507 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_2_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_4_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_4_Figure_0.jpeg new file mode 100644 index 0000000..5af857a Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_4_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_5_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_5_Figure_0.jpeg new file mode 100644 index 0000000..bacdd14 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_5_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_6_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_6_Figure_0.jpeg new file mode 100644 index 0000000..9c66965 
Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_6_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_7_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_7_Figure_0.jpeg new file mode 100644 index 0000000..d977aca Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_7_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_8_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_8_Figure_0.jpeg new file mode 100644 index 0000000..96e51d6 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_8_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/_page_9_Figure_0.jpeg b/research/papers/arXiv-scalerl/_page_9_Figure_0.jpeg new file mode 100644 index 0000000..372aca1 Binary files /dev/null and b/research/papers/arXiv-scalerl/_page_9_Figure_0.jpeg differ diff --git a/research/papers/arXiv-scalerl/arXiv-2510.13786v1.md b/research/papers/arXiv-scalerl/arXiv-2510.13786v1.md new file mode 100644 index 0000000..d07d364 --- /dev/null +++ b/research/papers/arXiv-scalerl/arXiv-2510.13786v1.md @@ -0,0 +1,554 @@ +# The Art of Scaling Reinforcement Learning Compute for LLMs + +Devvrit Khatri $^{2,*,\dagger}$ , Lovish Madaan $^{1,3,*}$ , Rishabh Tiwari $^{4\dagger}$ , Rachit Bansal $^{5\dagger}$ , Sai Surya Duvvuri $^{2\dagger}$ , Manzil Zaheer $^{1\dagger}$ , Inderjit S. Dhillon $^{2}$ , David Brandfonbrener $^{1}$ , Rishabh Agarwal $^{6,\dagger}$ + +Reinforcement learning (RL) has become central to training large language models (LLMs), yet the field lacks predictive scaling methodologies comparable to those established for pre-training. Despite rapidly rising compute budgets, there is no principled understanding of how to evaluate algorithmic improvements for scaling RL compute. We present the first large-scale systematic study, amounting to more than 400,000 GPU-hours, that defines a principled framework for analyzing and predicting RL scaling in LLMs. 
We fit sigmoidal compute-performance curves for RL training and ablate a wide range of common design choices to analyze their effects on asymptotic performance and compute efficiency. We observe: (1) Not all recipes yield similar asymptotic performance, (2) Details such as loss aggregation, normalization, curriculum, and off-policy algorithm primarily modulate compute efficiency without materially shifting the asymptote, and (3) Stable, scalable recipes follow predictable scaling trajectories, enabling extrapolation from smaller-scale runs. Combining these insights, we propose a best-practice recipe, ScaleRL, and demonstrate its effectiveness by successfully scaling and predicting validation performance on a single RL run scaled up to 100,000 GPU-hours. Our work provides both a scientific framework for analyzing scaling in RL and a practical recipe that brings RL training closer to the predictability long achieved in pre-training. + +Correspondence: {lovish, brandfon}@meta.com, {devvrit.03, rishabhagarwal.467}@gmail.com + +![](_page_0_Figure_7.jpeg) + +Figure 1 Predictably Scaling RL compute to 100,000 GPU Hours. (a) We run ScaleRL for 100k GPU hours on an 8B dense model, and 50k GPU hours on a 17Bx16 MoE (Scout). We fit a sigmoid curve (Equation (1)) on pass rate (mean@16) on an *iid* validation dataset up to 50k (and 16k) GPU hours and extrapolate to 100k (and 45k) on the 8B (Scout MoE) models respectively. We trained for 7400 steps for 8B and 7100 steps for Scout, which is 3.5× larger than ProRL (Liu et al., 2025a). The extrapolated curve (× markers) closely follows extended training, demonstrating both stability at large compute and predictive fits—establishing ScaleRL as a reliable candidate for RL scaling. (b) Downstream evaluation on AIME-24 shows a consistent scaling trend for ScaleRL, thus generalizing beyond the training data distribution. Moreover, scaling model size substantially improves the downstream and asymptotic RL performance. 
+ + $^1$Meta, $^2$UT Austin, $^3$UCL, $^4$UC Berkeley, $^5$Harvard University, $^6$Periodic Labs + +\*Equal contribution, †Work done at Meta + +## 1 Introduction + +Scaling reinforcement learning (RL) compute is emerging as a critical paradigm for advancing large language models (LLMs). While pre-training establishes the foundations of a model, the subsequent phase of RL training unlocks many of today's most important LLM capabilities, from test-time thinking (OpenAI, 2024; Guo et al., 2025) to agentic capabilities (Kimi Team et al., 2025a). For instance, Deepseek-R1-Zero used 100,000 H800 GPU hours for RL training – 3.75% of its pre-training compute (Guo et al., 2025). This dramatic increase in RL compute is amplified across frontier LLM generations, with a more than 10× increase from o1 to o3 (OpenAI, 2025) and a similar leap from Grok-3 to Grok-4 (xAI Team, 2025). + +While RL compute for LLMs has scaled massively, our understanding of how to scale RL has not kept pace; the methodology remains more art than science. Recent breakthroughs in RL are largely driven by isolated studies on novel algorithms (e.g., Yu et al. (DAPO, 2025)) and model-specific training reports, such as MiniMax et al. (2025) and Magistral (Rastogi et al., 2025). Critically, these studies provide ad-hoc solutions tailored to specific contexts, but not how to develop RL methods that scale with compute. This lack of scaling methodology stifles research progress: with no reliable way to identify promising RL candidates a priori, progress is tied to large-scale experimentation that sidelines most of the academic community. + +This work lays the groundwork for a science of RL scaling by borrowing from the well-established concept of scaling laws from pre-training. 
While pre-training has converged to algorithmic recipes that scale predictably with compute (Kaplan et al., 2020; Hoffmann et al., 2022; Owen, 2024), the RL landscape lacks a clear standard. As a result, RL practitioners face an overwhelming array of design choices, leaving the fundamental questions of how to scale and what to scale unanswered. To address these questions, we establish a predictive framework for RL performance using a sigmoid-like saturating curve between the expected reward $(R_C)$ on an iid validation set and training compute (C):

$$\overbrace{R_C - R_0}^{\text{Reward Gain}} = \overbrace{(A - R_0)}^{\text{Asymptotic Reward Gain}} \times \underbrace{\frac{1}{1 + (C_{\text{mid}}/C)^B}}_{\text{Compute Efficiency}} \quad (\text{fixed model and training data}) \tag{1}$$

where $0 \le A \le 1$ represents the asymptotic pass rate, $B > 0$ is a scaling exponent that determines the compute efficiency, and $C_{\text{mid}}$ sets the midpoint of the RL performance curve. A schematic interpretation of these parameters is provided in Figure 3.

The framework in Equation (1) allows researchers to extrapolate performance from lower-compute runs to higher compute budgets, enabling them to evaluate the scalability of RL methods without incurring the compute cost of running every experiment to its computational limit.

Guided by this framework, we develop **ScaleRL**, an RL recipe that scales *predictably* with compute. In a massive **100,000 GPU-hour training run**, we show that **ScaleRL**'s performance closely matches the scaling curve predicted by our framework (Figure 1). Critically, scaling curves extrapolated from only the initial stages of training closely match the final observed performance, confirming the predictive ability of our framework at extreme compute scales.

The design of **ScaleRL** is grounded in a comprehensive empirical study of RL scaling that spanned over **400,000 GPU-hours** (on Nvidia GB200 GPUs).
This study explored numerous design choices at the 8B-parameter model scale, where individual runs use up to 16,000 GPU-hours, making them **6× cheaper** than experimenting at our largest training-run scale. This investigation yielded three key principles:

- RL Performance Ceilings are Not Universal: As we scale training compute for different methods, they encounter different ceilings on their achievable performance (A). This limit can be shifted by choices such as the loss type and batch size.
- Embracing the Bitter Lesson: Methods that appear superior at small compute budgets can be worse when extrapolated to large-compute regimes (Figure 2). We can still identify scalable methods by estimating the scaling parameters (A, B) from the early training dynamics using our framework (Equation (1)).

![](_page_2_Figure_0.jpeg)

Figure 2 ScaleRL is more scalable than prevalent RL methods. We fit sigmoid curves (Equation 1) on an iid validation dataset for commonly-used training recipes like DeepSeek (GRPO) (Guo et al., 2025), Qwen-2.5 (DAPO) (Yu et al., 2025), Magistral (Rastogi et al., 2025), and MiniMax-M1 (MiniMax et al., 2025), and compare them with ScaleRL. ScaleRL surpasses all other methods, achieving an asymptotic reward of A = 0.61. Stars denote evaluation points; solid curves show the fitted curve over the range used for fitting; dashed curves extrapolate beyond it. We validate predictability by running each method for longer ("x" markers), which aligns closely with the extrapolated curves for stable recipes like ScaleRL and MiniMax. Further description of the individual recipes compared is given in Appendix A.16.

- Re-evaluating Common Wisdom: Common interventions thought to improve peak performance (e.g., loss aggregation, data curriculum, length penalty, advantage normalization) mainly adjust compute efficiency (B), while not changing the performance ceiling considerably.
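Fitting and extrapolating Equation (1) is mechanically simple. The sketch below illustrates the procedure with `scipy.optimize.curve_fit`; the compute/pass-rate measurements, initial guesses, and bounds are hypothetical, purely for illustration, not values from our runs.

```python
import numpy as np
from scipy.optimize import curve_fit

def sigmoid_reward(C, A, B, C_mid, R0):
    """Equation (1): R_C = R0 + (A - R0) / (1 + (C_mid / C)^B)."""
    return R0 + (A - R0) / (1.0 + (C_mid / C) ** B)

# Hypothetical (GPU-hours, mean@16 pass rate) validation measurements.
compute = np.array([2e3, 4e3, 6e3, 8e3, 1.2e4, 1.6e4])
pass_rate = np.array([0.31, 0.40, 0.45, 0.48, 0.52, 0.54])

R0 = pass_rate[0]  # reward at the start of the fitting window
(A, B, C_mid), _ = curve_fit(
    lambda C, A, B, C_mid: sigmoid_reward(C, A, B, C_mid, R0),
    compute,
    pass_rate,
    p0=[0.60, 1.0, 8e3],                          # illustrative initial guesses
    bounds=([R0, 1e-3, 1.0], [1.0, 10.0, 1e6]),   # R0 <= A <= 1, B > 0
)

# Extrapolate the fitted curve to a larger compute budget.
predicted = sigmoid_reward(1e5, A, B, C_mid, R0)
```

Fits of this kind, estimated from the early part of a run, are what allow the solid curves in Figure 2 to be extrapolated (dashed) before committing the full compute budget.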
Based on these insights, **ScaleRL** achieves *predictable* scaling by integrating existing methods, rather than by inventing new ones. Specifically, **ScaleRL** combines an asynchronous PipelineRL setup (§3.1), forced length interruptions, a truncated importance-sampling RL loss (CISPO), prompt-level loss averaging, batch-level advantage normalization, FP32 precision at the logits, zero-variance filtering, and No-Positive-Resampling, with each component's contribution validated in a leave-one-out ablation consuming 16,000 GPU-hours per run.

**ScaleRL** not only scales *predictably* but also establishes a new **state-of-the-art** (Figure 2): it achieves higher asymptotic performance and compute efficiency than established RL recipes. Moreover, **ScaleRL** maintains predictable scaling when increasing compute across multiple training axes (§5), including 2.5× larger batch sizes, longer generation lengths up to 32,768 tokens, multi-task RL using math and code, and a larger MoE (Llama-4 17B×16), with benefits that consistently transfer to downstream tasks. Overall, this work establishes a rigorous methodology for cost-effectively predicting the scalability of new RL algorithms.

# 2 Preliminaries & Setup

We consider reinforcement learning with LLMs, where prompts $x$ are sampled from a data distribution $D$. Our setup follows a generator-trainer split across GPUs: a subset of GPUs (generators) use optimized inference kernels for high-throughput rollout generation, while the remaining GPUs (trainers) run the training backend (FSDP) and update parameters. We denote by $\pi_{\rm gen}^{\theta}$ and $\pi_{\rm train}^{\theta}$ the model with parameters $\theta$ on the generator and training backends, respectively. For each prompt, the old policy $\pi_{\rm gen}^{\theta_{\rm old}}$ on the generator GPUs produces candidate completions, which are then assigned scalar rewards.
Policy optimization proceeds by maximizing a clipped surrogate objective, taking expectations over $x \sim D$ and rollouts from $\pi_{\rm gen}^{\theta_{\rm old}}$.

**Training Regimen** All experiments are conducted in the RL-for-reasoning domain, where the model produces a thinking trace enclosed in special tokens (e.g., `<think>` ... `</think>`) and a final solution. Unless noted, training uses a sequence length of 16,384 tokens: 12,288 for thinking, 2,048 for the solution, and an additional 2,048 for the input prompt. We adopt the 12,288-token thinking budget for faster iteration, and show in Section 5 that ScaleRL extrapolations remain predictive when training with larger thinking budgets (32,768). For math RL experiments, we use the Polaris-53K dataset (An et al., 2025) with a batch size of 768 (48 prompts with 16 generations each). In our setup, scaling RL compute corresponds to running multiple epochs over the training prompts. More details about training, including SFT and hyper-parameters, are in Appendix A.3.

**Base RL Algorithm** As our starting point in §3, we use a "base" algorithm that resembles GRPO (Shao et al., 2024) without any KL regularization term, in line with large-scale training reports (Rastogi et al., 2025; MiniMax et al., 2025). Additionally, we include the asymmetric DAPO clipping (Yu et al., 2025), because of its widespread adoption as a default approach to avoid entropy collapse and maintain output diversity.

For a given prompt x, the old policy $\pi_{\rm gen}^{\theta_{\rm old}}$ generates G candidate completions $\{y_i\}_{i=1}^G$, each assigned a scalar reward $r_i$.
We compute advantages $\hat{A}_i$ and group-normalized advantages using:

$$\hat{A}_i = r_i - \mathrm{mean}(\{r_j\}_{j=1}^G), \quad \hat{A}_i^G = \hat{A}_i / (\mathrm{std}(\{r_j\}_{j=1}^G) + \epsilon).$$

Each completion $y_i$ of length $|y_i|$ contributes token-level importance sampling (IS) ratios $\rho_{i,t}(\theta)$, with asymmetric upper and lower clipping thresholds, akin to DAPO (Yu et al., 2025):

$$\rho_{i,t}(\theta) := \frac{\pi_{\text{train}}^{\theta}(y_{i,t} \mid x, y_{i,<t})}{\pi_{\text{gen}}^{\theta_{\text{old}}}(y_{i,t} \mid x, y_{i,<t})}, \qquad \operatorname{clip}_{\text{asym}}(\rho, \epsilon^-, \epsilon^+) := \operatorname{clip}(\rho,\, 1 - \epsilon^-,\, 1 + \epsilon^+). \tag{2}$$

We aggregate losses at the *sample level*, i.e., averaging per-sample token losses before averaging across samples. The surrogate objective is given by:

$$\mathcal{J}(\theta) = \mathbb{E}_{\substack{x \sim D, \\ \{y_i\}_{i=1}^G \sim \pi_{\text{gen}}^{\theta_{\text{old}}}(\cdot|x)}} \left[ \frac{1}{G} \sum_{i=1}^G \frac{1}{|y_i|} \sum_{t=1}^{|y_i|} \min \left( \rho_{i,t}(\theta) \hat{A}_i^G, \operatorname{clip}_{\text{asym}}(\rho_{i,t}(\theta), \epsilon^-, \epsilon^+) \hat{A}_i^G \right) \right]. \tag{3}$$

**Controlling Generation Lengths** To prevent reasoning output lengths from exploding during training, which harms training stability and efficiency, we use interruptions (GLM-V Team et al., 2025; Yang et al., 2025) that forcibly stop overly long generations by appending an end-of-thinking phrase (e.g., `</think>`), signaling the LLM to terminate its reasoning and produce a final answer. We revisit this choice in Section 4 and compare it with a length penalty that penalizes long generations (Yu et al., 2025; Kimi Team et al., 2025b).

## 2.1 Predictive compute-scaling and fitting curves

Unlike pre-training, which typically uses power laws to fit predictive curves, we model pass rate versus log(compute) with a sigmoidal function (Equation (1)). We do so because we empirically found the sigmoidal fit to be much more robust and stable than a power-law fit, which we discuss further in Appendix A.4.
Moreover, our choice is consistent with prior work that uses sigmoid-like power laws to capture bounded metrics such as accuracy (Ruan et al., 2024; Srivastava et al., 2022).

Similar to pre-training studies (Li et al., 2025b; Porian et al., 2025), we find that excluding the very early low-compute regime yields more stable fits, after which training follows a predictable trajectory. Unless noted otherwise, all our scaling fits begin after ~1.5k GPU-hours. Further details of the fitting procedure are provided in Appendix A.5, and the robustness of our curve fitting is discussed in Appendix A.7.

**Interpreting scaling curves** Intuitively, a sigmoidal curve captures saturating returns: it grows slowly in the low-compute regime, accelerates sharply through a mid-range of efficient scaling, and then saturates at high compute. We also provide a schematic interpretation of the parameters A, B, and $C_{mid}$ of the sigmoidal curve in Figure 3. We see that B and $C_{mid}$ primarily affect the efficiency of the run, while A denotes the asymptotic performance at large compute scale. Further discussion of these parameters is provided in Appendix A.8.

![](_page_4_Figure_0.jpeg)

Figure 3 Interpreting Equation (1). We provide an example fit illustrating the roles of parameters A, B, and $C_{\text{mid}}$. $C_{\text{mid}}$ determines the compute point at which half of the total gain is achieved; smaller values correspond to a faster ascent toward the asymptote. B controls the curve's steepness, with larger values indicating greater efficiency. A represents the asymptotic performance reached at large compute scales. Further discussion is provided in Appendix A.8.

**Scaling curve on held-out validation** Consistent with pre-training practice (Hoffmann et al., 2022; Porian et al., 2025), we measure predictive performance on in-distribution validation data.
Since our training runs span multiple epochs, we hold out 1,000 randomly selected prompts from the Polaris-53K dataset for validation and use the remainder for training. The scaling curves are fitted on the validation points, which measure the average pass rate every 100 training steps, with 16 generations per prompt on the 1,000 held-out prompts.

# 3 An Empirical Study of RL Scaling

In this section, we conduct RL experiments using an 8B dense model on verifiable math problems. Using the setup described in Section 2, we study several design axes in terms of their predictable compute-scaling behavior, namely asymptotic performance (A) and compute efficiency (B), as shown in Figure 3.

We structure our experiments in three stages. First, we ablate design choices on top of the baseline at 3.5k to 4k GPU-hours, since some experimental choices destabilize beyond this scale (Appendix A.15); whenever a design change proved stable, we trained it for longer. Then, we combine the best choices into **ScaleRL** and run leave-one-out (LOO) experiments for 16k GPU-hours in Section 4. Here, we assess predictability by fitting on the first 8k GPU-hours and extrapolating over the remainder of the run. Finally, to demonstrate predictable scaling with **ScaleRL**, we also consider training setups with larger batch sizes, a mixture-of-experts model, multiple tasks (math and code), and longer sequence lengths in Section 5.

## 3.1 Asynchronous RL Setup

We first investigate the choice of asynchronous off-policy RL setup (Noukhovitch et al., 2024), as it governs training stability and efficiency, generally independent of all other design choices. Specifically, we consider two approaches for off-policy learning: PPO-off-policy-k and PipelineRL-k.

**PPO-off-policy-k** is the default approach for asynchronous RL and has been used previously by Qwen3 (Yang et al., 2025) and ProRL (Liu et al., 2025a).
In this setup, the old policy $\pi_{gen}^{\theta_{\text{old}}}$ generates reasoning traces for a batch of B prompts. Each gradient update processes a mini-batch of $\hat{B}$ prompts, resulting in $k = B/\hat{B}$ gradient updates per batch. In our experiments, we fix $\hat{B} = 48$ prompts (with 16 generations each), and vary $k \in \{1, 8\}$ by setting $B = k \times 48$.

**PipelineRL-k** is a recent approach from Piche et al. (2025), used by Magistral (Rastogi et al., 2025). In this regimen, generators continuously produce reasoning traces in a streaming fashion. Whenever trainers finish a policy update, the new parameters are immediately pushed to the generators, which continue generating with the updated weights but a stale KV cache from the old policy. Once a full batch of traces is generated, it is passed to the trainers for the next update. In our setup we introduce a parameter k: the trainers wait if they get k steps ahead of the generators.

![](_page_5_Figure_0.jpeg)

Figure 4 (a) Comparing "compute-scaling" of asynchronous off-policy RL setups. We report only the B (scaling exponent) and A (asymptotic pass rate) parameters of the fitted sigmoid curve (Equation 1). PipelineRL-k is much more efficient and slightly better in the large compute limit. (b) Different max off-policyness with PipelineRL.

We compare these approaches in Figure 4a. PipelineRL and PPO-off-policy achieve similar asymptotic performance A, but PipelineRL substantially improves the compute efficiency B, thus reaching the ceiling A faster. This is because PipelineRL reduces the amount of idle time in the training process. This choice yields reliable gains with fewer tokens, making larger sweeps possible at a lower compute budget. We also vary the maximum off-policyness for PipelineRL and find k = 8 to be optimal, as shown in Figure 4b, which we discuss further in Appendix A.11.
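The off-policyness bound in PipelineRL-k can be pictured as a simple gate between the trainer and the generators: the trainer may run at most k policy updates ahead of the weights currently loaded on the generators. The sketch below is our own toy illustration of that coordination (the class and method names are ours, not from any actual implementation).

```python
import threading

class StalenessGate:
    """Toy PipelineRL-k coordination: the trainer blocks once it is
    k policy updates ahead of the weights on the generators."""

    def __init__(self, k: int):
        self.k = k
        self.trainer_version = 0    # completed policy updates
        self.generator_version = 0  # weights currently on the generators
        self._cond = threading.Condition()

    def trainer_step(self) -> int:
        with self._cond:
            # Block while the trainer is already k updates ahead.
            while self.trainer_version - self.generator_version >= self.k:
                self._cond.wait()
            self.trainer_version += 1
            return self.trainer_version

    def push_weights(self) -> None:
        with self._cond:
            # Generators pick up the newest weights mid-generation
            # (keeping their stale KV cache) and unblock the trainer.
            self.generator_version = self.trainer_version
            self._cond.notify_all()

gate = StalenessGate(k=8)
for _ in range(8):
    gate.trainer_step()   # updates 1..8 proceed without waiting
gate.push_weights()       # a 9th update would block until weights are pushed
```

With k = 1 this reduces to nearly on-policy training, while larger k hides generator latency at the cost of staler data; the sweep in Figure 4b found k = 8 optimal in our setup.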
## 3.2 Algorithmic Choices

Building on the results above, we adopt PipelineRL-8 as our updated baseline. We then study six additional algorithmic axes: (a) loss aggregation, (b) advantage normalization, (c) precision fixes, (d) data curriculum, (e) batch definition, and (f) loss type. In Section 4, we combine the best options into a unified recipe, termed **ScaleRL** (Scale-able RL), and conduct leave-one-out experiments at a larger scale of 16,000 GPU-hours.

**Loss type** We compare the asymmetric DAPO loss (Eq. 8) with two recently proposed alternatives: GSPO (Zheng et al., 2025a) and CISPO (MiniMax et al., 2025; Yao et al., 2025). GSPO applies importance sampling at the sequence level, as opposed to GRPO's token-level formulation. Specifically, GSPO alters the token-level IS ratio (Eq. 2) to sequence-level ratios: $\rho_i(\theta) = \frac{\pi_{train}(y_i|x,\theta)}{\pi_{gen}(y_i|x,\theta_{old})}$. CISPO simply combines truncated IS with vanilla policy gradient (Ionides, 2008), where sg is the stop-gradient function and $T = \sum_{i=1}^G |y_i|$ is the total token count:

$$\mathcal{J}_{\text{CISPO}}(\theta) = \mathbb{E}_{\substack{x \sim D, \\ \{y_i\}_{i=1}^G \sim \pi_{gen}(\cdot|x,\theta_{\text{old}})}} \left[ \frac{1}{T} \sum_{i=1}^G \sum_{t=1}^{|y_i|} \operatorname{sg}\big(\min(\rho_{i,t}, \epsilon_{\max})\big) \, \hat{A}_i \, \log \pi_{\text{train}}(y_{i,t} \mid x, y_{i,<t}) \right] \tag{4}$$

Figure 5a shows that both GSPO and CISPO substantially outperform DAPO, improving the asymptotic pass rate A by a large margin. CISPO exhibits a prolonged near-linear reward increase, and is marginally better than GSPO later in training, so we opt for CISPO as our best loss type. Further discussion of off-policy loss types and their hyperparameter robustness is provided in Section 4 and Appendix A.17.

**FP32 Precision for LLM logits** The generators and trainers rely on different kernels for inference and training, leading to small numerical mismatches in their token probabilities (He & Lab, 2025).
RL training is highly sensitive to such discrepancies, since they directly affect the IS ratio in the surrogate objective. MiniMax et al. (2025) identified that these mismatches are especially pronounced at the language model head, and mitigate this by using FP32 computations at the head for both the generator and trainer.

![](_page_6_Figure_0.jpeg)

Figure 5 (a) Comparing popular loss functions: DAPO (Yu et al., 2025), GSPO (Zheng et al., 2025a), and CISPO (MiniMax et al., 2025). We find CISPO/GSPO achieve a higher asymptotic reward compared to DAPO. (b) Using FP32 precision in the final layer (LM head) gives a considerable boost in the asymptotic reward.

As shown in Figure 5b, the precision fix dramatically improves the asymptotic performance A from 0.52 to 0.61. Given this clear benefit, we include the FP32 precision fix in our **ScaleRL** recipe.

**Loss Aggregation** We evaluate three strategies for aggregating the RL loss: (a) *sample average*, where each rollout contributes equally (as in GRPO, Appendix A.2); (b) *prompt average*, where each prompt contributes equally (as in DAPO, Appendix A.2); (c) *token average*, where all token losses in the batch are averaged directly, without intermediate grouping. The comparison results are shown in Appendix A.9 (Figure 14a). We find prompt-average achieves the highest asymptotic performance and therefore use this choice for **ScaleRL**.

**Advantage Normalization** We compare three variants of advantage normalization: (a) *prompt level*, where advantages are normalized by the standard deviation of rewards from the rollouts of the same prompt (as in GRPO, Appendix A.2); (b) *batch level*, where advantages are normalized by the standard deviation across all generations in the batch, as used by Hu et al. (2025a) and Rastogi et al. (2025); (c) *no normalization*, where advantages are computed as raw rewards centered by the mean reward of the prompt's generations, without variance scaling (as proposed in Dr. GRPO (Liu et al., 2025b)).
A comparison plot is shown in Appendix A.9 (Figure 14b); all three methods are observed to yield similar performance. We therefore adopt batch-level normalization, as it is theoretically sound and marginally better. This choice is also further corroborated at a larger scale by the leave-one-out experiments in Section 4.

**Zero-Variance Filtering** Within each batch, some prompts yield identical rewards across all their generations. These "zero-variance" prompts have zero advantage and therefore contribute zero policy gradient. The default baseline includes such prompts in the loss computation, but it is unclear whether they should be included in the effective batch. To test this, we compare the default setting against an effective-batch approach, where only prompts with non-zero variance are included in the loss calculation, as done by Seed et al. (2025). Note that zero-variance filtering differs from dynamic sampling in DAPO (Yu et al., 2025): the former merely drops the prompts, while the latter resamples more prompts until the batch is full. We show in Figure 6a that using the effective batch performs better asymptotically, and we adopt it in our ScaleRL recipe.

**Adaptive Prompt Filtering** A number of data-curriculum strategies have been proposed for RL training to improve sample efficiency (An et al., 2025; Zhang et al., 2025b; Zheng et al., 2025b). Here we evaluate a simple variant, introduced by An et al. (2025), based on the key observation that once a prompt becomes too easy for a policy, it typically remains easy. Since such prompts consume compute but no longer contribute useful gradient signal (Section 3.2), it is better to exclude them from future training.
We implement this by maintaining a history of pass rates and permanently removing any prompt with pass rate $\geq 0.9$ from subsequent epochs—we call this **No-Positive-Resampling**. In Figure 6b we compare this curriculum against the default setting where all prompts are resampled uniformly throughout training. We see that the curriculum improves scalability and the asymptotic reward A.

![](_page_7_Figure_0.jpeg)

Figure 6 (a) "Zero" variance filtering: We filter out "zero" variance (accuracy 0 or 1) samples in a batch, since they contribute zero policy gradient, and find that this achieves a higher asymptote. (b) Adaptive prompt sampling: Filtering out prompts with pass rate > 0.9 in subsequent epochs results in higher asymptotic performance.

# 4 ScaleRL: Scaling RL Compute Effectively & Predictably

From the design axes studied above, we consolidate the best-performing settings into a single recipe, which we term ScaleRL (Scale-able RL). ScaleRL is an asynchronous RL recipe that uses PipelineRL with 8 steps of off-policyness, interruption-based length control for truncation, FP32 computation for logits, and optimizes the $\mathcal{J}_{\text{ScaleRL}}(\theta)$ loss.
This loss combines prompt-level loss aggregation, batch-level advantage normalization, a truncated importance-sampling REINFORCE loss (CISPO), zero-variance filtering, and No-Positive-Resampling:

$$\mathcal{J}_{\mathrm{ScaleRL}}(\theta) = \mathbb{E}_{\substack{x \sim D, \\ \{y_i\}_{i=1}^G \sim \pi_{gen}^{\theta_{old}}(\cdot|x)}} \left[ \frac{1}{\sum_{g=1}^G |y_g|} \sum_{i=1}^G \sum_{t=1}^{|y_i|} \mathrm{sg}(\min(\rho_{i,t}, \epsilon)) \hat{A}_i^{\mathrm{norm}} \log \pi_{train}^{\theta}(y_{i,t}) \right],$$

$$\rho_{i,t} = \frac{\pi_{train}^{\theta}(y_{i,t})}{\pi_{gen}^{\theta_{old}}(y_{i,t})}, \quad \hat{A}_i^{\mathrm{norm}} = \hat{A}_i / \hat{A}_{\mathrm{std}}, \quad 0 < \mathrm{mean}(\{r_j\}_{j=1}^G) < 1, \quad \mathrm{pass\_rate}(x) < 0.9,$$

where sg is the stop-gradient function, $\hat{A}_{std}$ is the standard deviation of all advantages $\hat{A}_i$ in the batch, and pass\_rate(x) denotes the historical pass rate of a prompt x. For forced interruptions, we use the end-of-thinking phrase: "Okay, time is up. Let me stop thinking and formulate a final answer now."

**Leave-One-Out (LOO) Ablations** To validate that these choices remain optimal when combined, we conduct *leave-one-out* (LOO) experiments: starting from **ScaleRL**, we revert one axis at a time to its baseline counterpart from Section 2. This ensures that each design decision contributes positively even in the presence of all the others. Figure 7 reports these experiments, each scaled to 16k GPU-hours.

Across all axes, **ScaleRL** consistently remains the most effective configuration, slightly outperforming the LOO variants in either asymptotic reward or compute efficiency (see the last column of the table in Figure 7). Since most LOO variants reach similar asymptotic pass rates, we transform the sigmoidal fit into a power-law fit to highlight efficiency differences via the slope B (details in Figure 7).
Concretely, we average the asymptotic reward A across all runs, re-fit the curves with this fixed A, and then compare slopes (measuring efficiency) in Figure 7. The corresponding non-transformed pass-rate vs. compute curves are provided in Appendix A.2.

![](_page_8_Figure_0.jpeg)

Figure 7 Leave-One-Out (LOO) Experiments: Starting from ScaleRL, we revert one design choice at a time to its baseline counterpart and re-train. Most LOO variants reach a similar asymptotic reward, with ScaleRL outperforming slightly overall. The main difference between these methods lies in efficiency. To highlight this, we re-arrange Equation (1) into $\mathcal{F}(R_c) = C^B$, where $\mathcal{F}(R_c) = C^{B}_{\text{mid}}/(\frac{A-R_0}{R_c-R_0}-1)$, and plot $\log \mathcal{F}(R_c)$ vs. $\log C$. This form makes the slope B directly visible, showing that ScaleRL achieves the highest compute efficiency.

**Error margin in fitting scaling curves** Since RL training is known to exhibit high variance (Agarwal et al., 2021), we use three independent ScaleRL runs (Figure 8a) to estimate the variability in the fitted scaling coefficients. The observed variance in the asymptotic reward and efficiency parameters serves as our empirical error margin, used to determine whether differences in compute efficiency or asymptotic performance between two runs are statistically meaningful (Madaan et al., 2024).

**Extrapolating Scaling Curves** In all our LOO experiments, as well as the independent **ScaleRL** runs, we fit the sigmoidal curve up to 8,000 GPU-hours and extrapolate to 16,000 GPU-hours, observing that the predicted curves align closely with both the training and extended points. This demonstrates the stability and predictability of **ScaleRL** and other stable, scalable recipes under large-scale RL training.

**Are the design choices worth it?** In Section 3.2, we saw that certain design choices alter asymptotic performance, such as the loss type (Figure 5a) and FP32 precision (Figure 5b).
However, in our LOO experiments with ScaleRL (Figure 7), these components appear less critical individually (last column in the figure). This raises the question of whether certain design choices can be safely left at their "default" values.

We argue the answer is no. Even when a choice seems redundant in the combined recipe, it can still provide stability or robustness that becomes decisive in other regimes. For example, while the FP32 precision fix makes little difference for the dense 8B model trained with **ScaleRL** (Figure 7), it provides large gains for GRPO/DAPO-style losses by mitigating numerical instabilities. This indicates that its benefits extend beyond the specific **ScaleRL** configuration we study. To further test this, we ran a leave-one-out experiment on the Scout 17B×16 MoE and observed that FP32 precision improves overall scalability (Figure 8b).

A similar case arises with the loss type. In Figure 7, reverting to DAPO yields asymptotic performance similar to CISPO within **ScaleRL**. Nonetheless, as we discuss in Appendix A.17, CISPO is markedly more robust to the choice of the IS-clipping parameter $\epsilon_{\text{max}}$, reducing the sensitivity of training to hyperparameter tuning. Moreover, it is also more efficient than DAPO, as seen in the LOO experiment (B = 2.01 vs. B = 1.77). This justifies preferring CISPO, even if a carefully tuned DAPO variant can perform similarly asymptotically.

In summary, even when individual design choices appear redundant within the combined recipe, they often enhance training stability, robustness, or efficiency in ways that generalize across models and setups. **ScaleRL** retains such components not just for marginal gains in a specific configuration, but because they address recurring sources of instability and variance that arise across reinforcement learning regimes.

![](_page_9_Figure_0.jpeg)

Figure 8 (a) Variance in scaling fits. We train 3 independent runs of ScaleRL to measure variance.
We observe a $\pm 0.02$ error margin for the asymptotic performance A. (b) FP32 LOO on Scout: Comparing ScaleRL on Scout with and without the FP32 precision fix at the LM head. ScaleRL performs better with the FP32 fix.

# 5 Predictable Scaling Returns Across RL Compute Axes

Given a fixed or growing compute budget, which scaling knob (context length, batch size, generations per prompt, or model size) buys the most reliable performance gain, and how early can we predict that return? We answer this by (i) fitting the saturating curve in Equation (1) early in training for each setting (precisely, at half the target budget), (ii) extrapolating to the target budget, and (iii) extending training to verify the forecast. Across all axes below we observe clean, predictive fits whose extrapolated curves align with the extended trajectories, mirroring the behavior seen in our 100,000 GPU-hour run (Figure 1) and the cross-recipe comparison in Figure 2.

**Model scale (MoE)** Does ScaleRL remain predictive and stable on larger models? Training the 17B×16 Llama-4 Scout MoE with ScaleRL exhibits the same predictable scaling behavior as the 8B model, with low truncation rates and no instability pathologies (Appendix A.15, A.17). Figure 1 shows the training curve. The extended points align with the fitted curve, supporting the model-scale invariance of our recipe. Moreover, the larger 17B×16 MoE exhibits much higher asymptotic RL performance than the 8B dense model, outperforming the 8B's performance using only 1/6 of its RL training compute.

**Generation length (context budget)** Increasing the generation length from 14k to 32k tokens slows early progress (lower B and higher $C_{mid}$) but consistently lifts the fitted asymptote (A), yielding higher final performance once sufficient compute is provided (Figure 9). This validates long-context RL as a ceiling-raising knob rather than a mere efficiency trade-off.
Extrapolations made from the fit correctly forecast the higher 32k-token trajectory when training is extended.

**Global batch size (prompts)** Smaller-batch runs show early stagnation on downstream benchmarks even as in-distribution validation performance continues to improve. Larger batches reliably improve the asymptote and avoid the downstream stagnation we observe in smaller-batch runs. Figure 10a shows the same qualitative pattern at mid-scale: small batches may appear better early but are overtaken as compute grows. In our largest math run in Figure 1, moving to a batch size of 2048 prompts both stabilized training and yielded a fit that extrapolated from up to 50k GPU-hours to the final 100k point.

**Generations per prompt (fixed total batch)** For a fixed total batch, is it better to allocate more prompts or more generations per prompt? Sweeping generations per prompt over {8, 16, 24, 32} and adjusting the number of prompts to keep the total batch fixed leaves the fitted scaling curves essentially unchanged (Appendix A.13), suggesting that, at moderate batch, this allocation is a second-order choice for both A and B. Clearer differences may emerge at much larger batches (e.g., 2k+), which we leave for future work.

![](_page_10_Figure_0.jpeg)

**Figure 9 Scaling RL Generation Length.** While long-context RL is less efficient initially, it eventually surpasses the performance of the smaller-context run. This trend is observed on both the *iid* validation set (left) and downstream evaluations (right).

![](_page_10_Figure_2.jpeg)

Figure 10 Scaling RL batch size. Larger batch sizes are slower to train but settle at a higher asymptote. Batch size shows an inverse trend initially: smaller values seem better at lower compute budgets, but larger batches reach higher asymptotic performance at larger scale.

## 6 Related Work

We detail the two works most relevant to our study in this section.
ProRL (Liu et al., 2025a) demonstrates that prolonged RL fine-tuning of LLMs ($\sim 2000$ optimization steps, batch size 64) for 16K GPU-hours using a mix of reasoning tasks uncovers novel solution strategies beyond a model's base capabilities. This longer training regimen delivered significant gains on a 1.5B model, rivaling the performance of larger models on some benchmarks. ProRL's contributions lie in specific heuristics for stability (KL regularization, policy resetting, entropy controls, etc.) to achieve high performance out of a 1.5B model.

Liu et al. (2025c) offer a complementary perspective: they ablate various design choices under consistent conditions on Qwen-3 4B/8B (Yang et al., 2025), and present a minimalist combination, LitePPO, that outperforms more complex methods like GRPO (Shao et al., 2024) and DAPO (Yu et al., 2025) at smaller model and compute scales. This yields valuable algorithmic insights, but the focus is on comparative empirical findings rather than on scaling behaviour.

![](_page_11_Figure_0.jpeg)

Figure 11 ScaleRL scales predictably on math and code. We report both the code and math validation-set performance on the joint math+code RL run, along with the math-only ScaleRL run as a reference. These results demonstrate that our sigmoidal compute-performance relationship holds across task mixtures, and that ScaleRL's scalability generalizes beyond single-domain training.

Neither of these works studies the "scaling" properties of these methods. In fact, their main comparisons are done on downstream evaluations, which may not be the right metric for studying predictable scaling. Rather, as done in pre-training and in our work here, we study performance on an in-distribution held-out eval set.
In contrast to these related works, our work develops and validates a compute-performance framework with predictive fits, while operating at a much larger compute budget (e.g., 6x larger than ProRL) and model scale. Additionally, our findings yield a near state-of-the-art RL recipe that scales predictably to over 100,000 GPU-hours without any stability issues. The rest of the related work is deferred to Appendix A.1.

## 7 Discussion & Conclusion

In this work, we study the scaling properties of different techniques used in RL for LLMs in pursuit of a predictable, scalable recipe. To this end, we derive a method for fitting predictive scaling curves for accuracy on the validation set, which allows us to quantify the asymptotic performance and compute efficiency of an RL method. Using this methodology, our primary contribution is a careful series of ablations of the algorithmic options that go into an RL recipe. For each ablation, we choose the option with higher asymptotic performance when possible, and improved efficiency otherwise. Combining these choices yields the **ScaleRL** recipe, which scales better than all existing recipes in our experiments.

A few observations are in order:

- Compute scaling extrapolation. An important insight of our scaling methodology is that we can use smaller-scale ablations in a systematic way to predict performance at larger scales. This allows us to create our final scalable recipe.
- Most important decisions. The off-policy algorithm, loss function, and model precision are the most important decisions from our ablations. None of the other decisions has a large individual effect, but as we see from the leave-one-out experiments, they still have some cumulative impact (in terms of efficiency) when combined.
- Asymptotic performance vs. efficiency.
For many of our ablations, we found that the better option improves both efficiency and asymptotic performance, but this is not always the case (e.g., FP32 in Figure 5b). When doing the "forward" ablations starting from the baseline method, we opt for asymptotic performance first and foremost. Interestingly, when doing the "backward" leave-one-out ablations from the ScaleRL recipe, we find very little impact on asymptotic performance from each individual decision, but each component of the algorithm seems to help efficiency. This shows that the cumulative effect of the changes is quite robust.

- Generalization. While we report transfer to downstream evaluations, our primary focus is on studying predictive scaling, which is characterized through in-distribution performance curves on a dataset held out from the training prompts (Li et al., 2025b; Muennighoff et al., 2025). This still leaves the question of how well the LLM generalizes from the training distribution to held-out test sets. While a full characterization of generalization is beyond the scope of our work, we do observe a correlation between in-distribution validation and downstream generalization performance. However, some algorithmic choices seem to help generalization more than others; notable among them are larger batch sizes (Section A.14), reducing truncations (Section A.15), longer generation lengths (Section 5, Figure 9), and larger model scale (Section 5, Figure 1).
- Multi-task RL. While our experiments focus mainly on the math domain, we also evaluate ScaleRL under multi-task RL training. As shown in Figure 11, joint training on math and code yields clean, parallel power-law trends for each domain, with extended runs remaining aligned with the extrapolated curves. While these preliminary results are promising, it would be interesting to thoroughly study the predictability of compute scaling for multi-task RL with different training data mixtures.
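For concreteness, the sigmoidal compute-performance form of Equation (1) that underlies these fits can be sketched numerically; the parameter values below are illustrative placeholders, not fitted values from our runs:

```python
def sigmoid_reward(C, A, R0, Cmid, B):
    """Sigmoidal compute-performance curve (Equation (1)):
    expected reward R_C as a function of training compute C (GPU-hours),
    with asymptote A, initial reward R0, midpoint Cmid, and efficiency B."""
    return R0 + (A - R0) / (1.0 + (Cmid / C) ** B)

# Illustrative (made-up) parameter values, not fitted values from the paper.
A, R0, Cmid, B = 0.65, 0.30, 4000.0, 1.7

for C in (1_000, 10_000, 100_000):  # GPU-hours
    print(f"{C:>7} GPU-hours -> reward {sigmoid_reward(C, A, R0, Cmid, B):.3f}")
```

The asymptote A bounds the achievable performance, while B controls how quickly the curve approaches that ceiling as compute grows; R_C approaches A from below as C increases.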
**Future work** A natural next step is to derive predictive "scaling laws" for RL across pre-training compute, model size, and RL training data. Future studies can also explore other axes of RL compute scaling, such as incorporating structured or dense rewards (Setlur et al., 2024) and more compute-intensive generative verifiers (Zhang et al., 2025a), to find the optimal compute allocation for RL training. Finally, the methodological framework introduced here can be applied to study the scaling behavior of other post-training regimes, including multi-turn RL, agentic interaction, and long-form reasoning.

There are of course many design choices in RL, so we do not think that our **ScaleRL** recipe is the end of the story. We hope that our focus on scalable RL and our methodology for predicting scalability can inspire future work to push the frontier of RL for LLMs even further. To enable future studies to fit compute-performance RL scaling curves, we release a minimal code repository at www.devvrit.com/scalerl_curve_fitting.

## 8 Acknowledgments

The authors would like to thank Sneha Kudugunta and Niladri Chatterji for helpful discussions on pre-training and scaling laws. Additionally, the authors are grateful to Aviral Kumar, Prateek Jain, Lewis Tunstall, Nathan Lambert, Dzmitry Bahdanau, and John Quan for helpful feedback on earlier drafts. Finally, the authors would also like to thank Jenya Lee and Abhinav Jauhri for infrastructure and compute support.

## References

Rishabh Agarwal, Max Schwarzer, Pablo Samuel Castro, Aaron C Courville, and Marc Bellemare. Deep reinforcement learning at the edge of the statistical precipice. *Advances in Neural Information Processing Systems*, 34:29304–29320, 2021.

Chenxin An, Zhihui Xie, Xiaonan Li, Lei Li, Jun Zhang, Shansan Gong, Ming Zhong, Jingjing Xu, Xipeng Qiu, Mingxuan Wang, and Lingpeng Kong. Polaris: A post-training recipe for scaling reinforcement learning on advanced reasoning models, 2025.
URL https://hkunlp.github.io/blog/2025/Polaris.

AoPS. AIME problem set 1983-2025, 2025. URL https://artofproblemsolving.com/wiki/index.php/AIME\_Problems\_and\_Solutions.

- Quentin Carbonneaux, Gal Cohen, Jonas Gehring, Jacob Kahn, Jannik Kossen, Felix Kreuk, Emily McMilin, Michel Meyer, Yuxiang Wei, David Zhang, et al. Cwm: An open-weights llm for research on code generation with world models. arXiv preprint arXiv:2510.02387, 2025.
- Ganqu Cui, Yuchen Zhang, Jiacheng Chen, Lifan Yuan, Zhi Wang, Yuxin Zuo, Haozhan Li, Yuchen Fan, Huayu Chen, Weize Chen, Zhiyuan Liu, Hao Peng, Lei Bai, Wanli Ouyang, Yu Cheng, Bowen Zhou, and Ning Ding. The entropy mechanism of reinforcement learning for reasoning language models, 2025. URL https://arxiv.org/abs/2505.22617.
- GLM-V Team, Wenyi Hong, Wenmeng Yu, Xiaotao Gu, Guo Wang, Guobing Gan, Haomiao Tang, Jiale Cheng, Ji Qi, Junhui Ji, Lihang Pan, Shuaiqi Duan, Weihan Wang, Yan Wang, Yean Cheng, Zehai He, Zhe Su, Zhen Yang, Ziyang Pan, Aohan Zeng, Baoxu Wang, Bin Chen, Boyan Shi, Changyu Pang, Chenhui Zhang, Da Yin, Fan Yang, Guoqing Chen, Jiazheng Xu, Jiale Zhu, Jiali Chen, Jing Chen, Jinhao Chen, Jinghao Lin, Jinjiang Wang, Junjie Chen, Leqi Lei, Letian Gong, Leyi Pan, Mingdao Liu, Mingde Xu, Mingzhi Zhang, Qinkai Zheng, Sheng Yang, Shi Zhong, Shiyu Huang, Shuyuan Zhao, Siyan Xue, Shangqin Tu, Shengbiao Meng, Tianshu Zhang, Tianwei Luo, Tianxiang Hao, Tianyu Tong, Wenkai Li, Wei Jia, Xiao Liu, Xiaohan Zhang, Xin Lyu, Xinyue Fan, Xuancheng Huang, Yanling Wang, Yadong Xue, Yanfeng Wang, Yanzi Wang, Yifan An, Yifan Du, Yiming Shi, Yiheng Huang, Yilin Niu, Yuan Wang, Yuanchang Yue, Yuchen Li, Yutao Zhang, Yuting Wang, Yu Wang, Yuxuan Zhang, Zhao Xue, Zhenyu Hou, Zhengxiao Du, Zihan Wang, Peng Zhang, Debing Liu, Bin Xu, Juanzi Li, Minlie Huang, Yuxiao Dong, and Jie Tang. Glm-4.5v and glm-4.1v-thinking: Towards versatile multimodal reasoning with scalable reinforcement learning, 2025. URL https://arxiv.org/abs/2507.01006.
+- Daya Guo, Dejian Yang, Haowei Zhang, Junxiao Song, Peiyi Wang, Qihao Zhu, Runxin Xu, Ruoyu Zhang, Shirong Ma, Xiao Bi, et al. Deepseek-r1 incentivizes reasoning in llms through reinforcement learning. *Nature*, 645(8081): 633–638, 2025. +- Horace He and Thinking Machines Lab. Defeating nondeterminism in LLM inference. Thinking Machines Lab: Connectionism, 2025. doi: 10.64434/tml.20250910. https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/. +- Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874, 2021. +- Jordan Hoffmann, Sebastian Borgeaud, Arthur Mensch, Elena Buchatskaya, Trevor Cai, Eliza Rutherford, Diego de Las Casas, Lisa Anne Hendricks, Johannes Welbl, Aidan Clark, et al. Training compute-optimal large language models. arXiv preprint arXiv:2203.15556, 2022. +- Jian Hu, Jason Klein Liu, Haotian Xu, and Wei Shen. Reinforce++: An efficient rlhf algorithm with robustness to both prompt and reward models, 2025a. URL https://arxiv.org/abs/2501.03262. +- Jingcheng Hu, Yinmin Zhang, Qi Han, Daxin Jiang, Xiangyu Zhang, and Heung-Yeung Shum. Open-reasoner-zero: An open source approach to scaling up reinforcement learning on the base model. arXiv preprint arXiv:2503.24290, 2025b. +- Edward L Ionides. Truncated importance sampling. Journal of Computational and Graphical Statistics, 17(2):295–311, 2008 +- Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361, 2020. +- Kimi Team, Yifan Bai, Yiping Bao, Guanduo Chen, Jiahao Chen, Ningxin Chen, Ruijue Chen, Yanru Chen, Yuankun Chen, Yutian Chen, et al. Kimi k2: Open agentic intelligence. arXiv preprint arXiv:2507.20534, 2025a. 
- Kimi Team, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al. Kimi k1.5: Scaling reinforcement learning with llms. arXiv preprint arXiv:2501.12599, 2025b.
- Aonian Li, Bangwei Gong, Bo Yang, Boji Shan, Chang Liu, Cheng Zhu, Chunhao Zhang, Congchao Guo, Da Chen, Dong Li, et al. Minimax-01: Scaling foundation models with lightning attention. arXiv preprint arXiv:2501.08313, 2025a.
- Margaret Li, Sneha Kudugunta, and Luke Zettlemoyer. (mis)fitting: A survey of scaling laws, 2025b. URL https://arxiv.org/abs/2502.18969.
- Hunter Lightman, Vineet Kosaraju, Yuri Burda, Harrison Edwards, Bowen Baker, Teddy Lee, Jan Leike, John Schulman, Ilya Sutskever, and Karl Cobbe. Let's verify step by step. In The Twelfth International Conference on Learning Representations, 2023.
- Mingjie Liu, Shizhe Diao, Ximing Lu, Jian Hu, Xin Dong, Yejin Choi, Jan Kautz, and Yi Dong. Prorl: Prolonged reinforcement learning expands reasoning boundaries in large language models, 2025a. URL https://arxiv.org/abs/2505.24864.
- Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin. Understanding r1-zero-like training: A critical perspective, 2025b. URL https://arxiv.org/abs/2503.20783.
- Zihe Liu, Jiashun Liu, Yancheng He, Weixun Wang, Jiaheng Liu, Ling Pan, Xinyu Hu, Shaopan Xiong, Ju Huang, Jian Hu, Shengyi Huang, Siran Yang, Jiamang Wang, Wenbo Su, and Bo Zheng. Part i: Tricks or traps? a deep dive into rl for llm reasoning, 2025c. URL https://arxiv.org/abs/2508.08221.
- Ilya Loshchilov and Frank Hutter. Decoupled weight decay regularization, 2019. URL https://arxiv.org/abs/1711.05101.
- Michael Luo, Sijun Tan, Roy Huang, Ameen Patel, Alpay Ariyak, Qingyang Wu, Xiaoxiang Shi, Rachel Xin, Colin Cai, Maurice Weber, Ce Zhang, Erran Li Li, Raluca Ada Popa, and Ion Stoica. Deepcoder: A fully open-source 14b coder at o3-mini level, 2025.
URL https://www.together.ai/blog/deepcoder. Notion Blog. +- Lovish Madaan, Aaditya K. Singh, Rylan Schaeffer, Andrew Poulton, Sanmi Koyejo, Pontus Stenetorp, Sharan Narang, and Dieuwke Hupkes. Quantifying variance in evaluation benchmarks, 2024. URL https://arxiv.org/abs/2406.10229. +- Aaron Meurer, Christopher P Smith, Mateusz Paprocki, Ondřej Čertík, Sergey B Kirpichev, Matthew Rocklin, AMiT Kumar, Sergiu Ivanov, Jason K Moore, Sartaj Singh, et al. Sympy: symbolic computing in python. *PeerJ Computer Science*, 3:e103, 2017. +- MiniMax, :, Aili Chen, Aonian Li, Bangwei Gong, Binyang Jiang, Bo Fei, Bo Yang, Boji Shan, Changqing Yu, Chao Wang, Cheng Zhu, Chengjun Xiao, Chengyu Du, Chi Zhang, Chu Qiao, Chunhao Zhang, Chunhui Du, Congchao Guo, Da Chen, Deming Ding, Dianjun Sun, Dong Li, Enwei Jiao, Haigang Zhou, Haimo Zhang, Han Ding, Haohai Sun, Haoyu Feng, Huaiguang Cai, Haichao Zhu, Jian Sun, Jiaqi Zhuang, Jiaren Cai, Jiayuan Song, Jin Zhu, Jingyang Li, Jinhao Tian, Jinli Liu, Junhao Xu, Junjie Yan, Junteng Liu, Junxian He, Kaivi Feng, Ke Yang, Kecheng Xiao, Le Han, Leyang Wang, Lianfei Yu, Liheng Feng, Lin Li, Lin Zheng, Linge Du, Lingyu Yang, Lunbin Zeng, Minghui Yu, Mingliang Tao, Mingyuan Chi, Mozhi Zhang, Mujie Lin, Nan Hu, Nongyu Di, Peng Gao, Pengfei Li, Pengyu Zhao, Qibing Ren, Qidi Xu, Qile Li, Qin Wang, Rong Tian, Ruitao Leng, Shaoxiang Chen, Shaoyu Chen, Shengmin Shi, Shitong Weng, Shuchang Guan, Shuqi Yu, Sichen Li, Songquan Zhu, Tengfei Li, Tianchi Cai, Tianrun Liang, Weiyu Cheng, Weize Kong, Wenkai Li, Xiancai Chen, Xiangjun Song, Xiao Luo, Xiao Su, Xiaobo Li, Xiaodong Han, Xinzhu Hou, Xuan Lu, Xun Zou, Xuyang Shen, Yan Gong, Yan Ma, Yang Wang, Yiqi Shi, Yiran Zhong, Yonghong Duan, Yongxiang Fu, Yongyi Hu, Yu Gao, Yuanxiang Fan, Yufeng Yang, Yuhao Li, Yulin Hu, Yunan Huang, Yunji Li, Yunzhi Xu, Yuxin Mao, Yuxuan Shi, Yuze Wenren, Zehan Li, Zelin Li, Zhanxu Tian, Zhengmao Zhu, Zhenhua Fan, Zhenzhen Wu, Zhichao Xu, Zhihang Yu, Zhiheng Lyu, Zhuo Jiang, 
Zibo Gao, Zijia Wu, Zijian Song, and Zijun Sun. Minimax-m1: Scaling test-time compute efficiently with lightning attention, 2025. URL https://arxiv.org/abs/2506.13585. +- Niklas Muennighoff, Alexander M. Rush, Boaz Barak, Teven Le Scao, Aleksandra Piktus, Nouamane Tazi, Sampo Pyysalo, Thomas Wolf, and Colin Raffel. Scaling data-constrained language models, 2025. URL https://arxiv.org/abs/2305.16264. +- Michael Noukhovitch, Shengyi Huang, Sophie Xhonneux, Arian Hosseini, Rishabh Agarwal, and Aaron Courville. Asynchronous rlhf: Faster and more efficient off-policy rl for language models. arXiv preprint arXiv:2410.18252, 2024. +- OpenAI. Openai o1 system card. arXiv preprint arXiv:2412.16720, 2024. +- OpenAI. Introducing OpenAI o3 and o4-mini. https://openai.com/index/introducing-o3-and-o4-mini/, 2025. Accessed: 22 September 2025. +- David Owen. How predictable is language model benchmark performance? arXiv preprint arXiv:2401.04757, 2024. +- Alex Piche, Rafael Pardinas, Ehsan Kamalloo, and Dzmitry Bahdanau. Pipelinerl. 2025. URL https://huggingface.co/blog/ServiceNow/pipelinerl. +- Tomer Porian, Mitchell Wortsman, Jenia Jitsev, Ludwig Schmidt, and Yair Carmon. Resolving discrepancies in compute-optimal scaling of language models, 2025. URL https://arxiv.org/abs/2406.19146. +- Abhinav Rastogi, Albert Q Jiang, Andy Lo, Gabrielle Berrada, Guillaume Lample, Jason Rute, Joep Barmentlo, Karmesh Yadav, Kartik Khandelwal, Khyathi Raghavi Chandu, et al. Magistral. arXiv preprint arXiv:2506.10910, 2025. + +- Yangjun Ruan, Chris J. Maddison, and Tatsunori Hashimoto. Observational scaling laws and the predictability of language model performance, 2024. URL https://arxiv.org/abs/2405.10938. +- John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms, 2017. URL https://arxiv.org/abs/1707.06347. 
- ByteDance Seed, Jiaze Chen, Tiantian Fan, Xin Liu, Lingjun Liu, Zhiqi Lin, Mingxuan Wang, Chengyi Wang, Xiangpeng Wei, Wenyuan Xu, et al. Seed1.5-thinking: Advancing superb reasoning models with reinforcement learning. arXiv preprint arXiv:2504.13914, 2025.
- Amrith Setlur, Chirag Nagpal, Adam Fisch, Xinyang Geng, Jacob Eisenstein, Rishabh Agarwal, Alekh Agarwal, Jonathan Berant, and Aviral Kumar. Rewarding progress: Scaling automated process verifiers for llm reasoning, 2024. URL https://arxiv.org/abs/2410.08146.
- Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, YK Li, Yang Wu, et al. Deepseekmath: Pushing the limits of mathematical reasoning in open language models. arXiv preprint arXiv:2402.03300, 2024.
- Aarohi Srivastava, Abhinav Rastogi, Abhishek Rao, Abu Awal Md Shoeb, Abubakar Abid, Adam Fisch, Adam R Brown, Adam Santoro, Aditya Gupta, Adrià Garriga-Alonso, et al. Beyond the imitation game: Quantifying and extrapolating the capabilities of language models. arXiv preprint arXiv:2206.04615, 2022.
- Mitchell Wortsman, Tim Dettmers, Luke Zettlemoyer, Ari Morcos, Ali Farhadi, and Ludwig Schmidt. Stable and low-precision training for large-scale vision-language models. *Advances in Neural Information Processing Systems*, 36:10271–10298, 2023.
- xAI Team. Grok 4. 2025. URL https://x.ai/news/grok-4.
- Yuxi Xie, Anirudh Goyal, Wenyue Zheng, Min-Yen Kan, Timothy P Lillicrap, Kenji Kawaguchi, and Michael Shieh. Monte carlo tree search boosts reasoning via iterative preference learning. arXiv preprint arXiv:2405.00451, 2024.
+- An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, Chujie Zheng, Dayiheng Liu, Fan Zhou, Fei Huang, Feng Hu, Hao Ge, Haoran Wei, Huan Lin, Jialong Tang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jing Zhou, Jingren Zhou, Junyang Lin, Kai Dang, Keqin Bao, Kexin Yang, Le Yu, Lianghao Deng, Mei Li, Mingfeng Xue, Mingze Li, Pei Zhang, Peng Wang, Qin Zhu, Rui Men, Ruize Gao, Shixuan Liu, Shuang Luo, Tianhao Li, Tianyi Tang, Wenbiao Yin, Xingzhang Ren, Xinyu Wang, Xinyu Zhang, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yinger Zhang, Yu Wan, Yuqiong Liu, Zekun Wang, Zeyu Cui, Zhenru Zhang, Zhipeng Zhou, and Zihan Qiu. Qwen3 technical report, 2025. URL https://arxiv.org/abs/2505.09388. +- Feng Yao, Liyuan Liu, Dinghuai Zhang, Chengyu Dong, Jingbo Shang, and Jianfeng Gao. Your efficient RL framework secretly brings you off-policy RL training, 2025. URL https://fengyao.notion.site/off-policy-rl. Accessed through a social media reference. +- Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Weinan Dai, Tiantian Fan, Gaohong Liu, Lingjun Liu, et al. Dapo: An open-source llm reinforcement learning system at scale. arXiv preprint arXiv:2503.14476, 2025 +- Yufeng Yuan, Yu Yue, Ruofei Zhu, Tiantian Fan, and Lin Yan. What's behind PPO's collapse in long-cot? value optimization holds the secret. arXiv preprint arXiv:2503.01491, 2025. +- Yu Yue, Yufeng Yuan, Qiying Yu, Xiaochen Zuo, Ruofei Zhu, Wenyuan Xu, Jiaze Chen, Chengyi Wang, TianTian Fan, Zhengyin Du, et al. Vapo: Efficient and reliable reinforcement learning for advanced reasoning tasks. arXiv preprint arXiv:2504.05118, 2025. +- Lunjun Zhang, Arian Hosseini, Hritik Bansal, Mehran Kazemi, Aviral Kumar, and Rishabh Agarwal. Generative verifiers: Reward modeling as next-token prediction, 2025a. URL https://arxiv.org/abs/2408.15240. +- Ruiqi Zhang, Daman Arora, Song Mei, and Andrea Zanette. 
SPEED-RL: Faster training of reasoning models via online curriculum learning, 2025b. URL https://arxiv.org/abs/2506.09016.
- Chujie Zheng, Shixuan Liu, Mingze Li, Xiong-Hui Chen, Bowen Yu, Chang Gao, Kai Dang, Yuqiong Liu, Rui Men, An Yang, Jingren Zhou, and Junyang Lin. Group sequence policy optimization, 2025a. URL https://arxiv.org/abs/2507.18071.

Haizhong Zheng, Yang Zhou, Brian R. Bartoldson, Bhavya Kailkhura, Fan Lai, Jiawei Zhao, and Beidi Chen. Act only when it pays: Efficient reinforcement learning for LLM reasoning via selective rollouts, 2025b. URL https://arxiv.org/abs/2506.02177.

# A Appendix

## A.1 Extended Related Work

A wave of recent work has applied Reinforcement Learning (RL) to improve the reasoning abilities of large language models (LLMs), often achieving state-of-the-art results on challenging tasks (OpenAI, 2024; Guo et al., 2025; Seed et al., 2025; Carbonneaux et al., 2025). OpenAI's o1 series of models established that large-scale RL can substantially enhance long-horizon reasoning, but did not release any details on how these models were trained. Deepseek R1 (and R1-Zero) (Guo et al., 2025) provided the first comprehensive study on training high-performing, long Chain-of-Thought (CoT) models primarily via RL, documenting emergent behaviours under extended RL without any reliance on reward models (Lightman et al., 2023) or Monte Carlo Tree Search (MCTS) (Xie et al., 2024).

The earliest widely referenced RLVR (verifiable-reward) algorithm underlying this wave of reasoning development is Group Relative Policy Optimization (GRPO), introduced in Shao et al. (2024). GRPO is a critic-free, group-relative policy gradient with PPO-style clipping that replaces a learned value baseline with group baselines to reduce computational cost and stabilize credit assignment for long CoTs.
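The group-baseline idea can be illustrated with a minimal sketch (our own NumPy illustration, not the authors' implementation):

```python
import numpy as np

def group_relative_advantages(rewards, eps=1e-6):
    """Group-relative advantage: normalize each completion's reward by the
    mean and standard deviation of its prompt's group of G samples."""
    rewards = np.asarray(rewards, dtype=float)  # shape (G,)
    return (rewards - rewards.mean()) / (rewards.std() + eps)

# Four sampled completions for one prompt, with verifiable rewards of +1/-1:
adv = group_relative_advantages([1.0, -1.0, -1.0, 1.0])
print(adv)  # correct completions get positive advantage, incorrect negative
```

Each token of a completion then shares that completion's advantage, replacing the per-token estimates a learned critic would otherwise provide.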
While GRPO catalyzed rapid progress, subsequent works document its limitations (token-level clipping, model-collapse risks) and motivate different group- or sequence-level variants (Yu et al., 2025; Yue et al., 2025; Hu et al., 2025b; Zheng et al., 2025a).

Yu et al. (2025) propose Decoupled clip and Dynamic Sampling Policy Optimization (DAPO), which decouples the $\epsilon_{\text{low}}$ and $\epsilon_{\text{high}}$ clipping thresholds in the GRPO objective and applies Clip-Higher to $\epsilon_{\text{high}}$ to avoid entropy collapse. Furthermore, DAPO dynamically samples prompts within a batch to avoid samples with zero variance (and hence zero advantage), which contribute zero policy gradients. Finally, it employs token-level loss aggregation, unlike GRPO, which uses sample-level loss averaging. With these modifications, DAPO surpasses the vanilla GRPO baseline while avoiding entropy collapse during RL training. In parallel, Yue et al. (2025) develop VAPO, a value-augmented PPO tailored for long CoTs, which trains stably and outperforms value-free baselines like GRPO and DAPO. They combine value pre-training and decoupled Generalized Advantage Estimation (GAE) from VC-PPO (Yuan et al., 2025) with the loss-objective modifications from DAPO, and propose length-adaptive GAE, arriving at an open recipe, VAPO, that has been used to train large MoE models in Seed et al. (2025). Similarly, other technical reports, such as Magistral (Rastogi et al., 2025), Kimi-k1.5 (Kimi Team et al., 2025b), and Minimax-01 (Li et al., 2025a), share various details of their RL training recipes, but do not present extensive experiments on why their design choices are better than the baselines.

## A.2 RL for LLMs: GRPO and DAPO

**Group Relative Policy Optimization (GRPO)** GRPO (Shao et al., 2024) adapts PPO (Schulman et al., 2017) for LLM fine-tuning with verifiable rewards.
For a given prompt $x$, the old policy $\pi_{\text{gen}}(\theta_{\text{old}})$ generates $G$ candidate completions $\{y_i\}_{i=1}^G$, each assigned a scalar reward $r_i$. To emphasize relative quality within the group, rewards are normalized as

$$\hat{A}_i = \frac{r_i - \mathrm{mean}(\{r_j\}_{j=1}^G)}{\mathrm{std}(\{r_j\}_{j=1}^G) + \varepsilon}.$$

(5)

Each completion $y_i$ of length $|y_i|$ contributes at the token level through ratios

$$\rho_{i,t}(\theta) = \frac{\pi_{\text{train}}(y_{i,t} \mid x, y_{i,<t}, \theta)}{\pi_{\text{gen}}(y_{i,t} \mid x, y_{i,<t}, \theta_{\text{old}})}.$$

(6)

The GRPO objective averages across both completions and tokens:

$$\mathcal{J}_{\text{GRPO}}(\theta) = \mathbb{E}_{\substack{x \sim D, \\ \{y_i\}_{i=1}^G \sim \pi_{\text{gen}}(\cdot|x,\theta_{\text{old}})}} \left[ \frac{1}{G} \sum_{i=1}^G \frac{1}{|y_i|} \sum_{t=1}^{|y_i|} \min\left(\rho_{i,t}(\theta) \hat{A}_i,\ \text{clip}\left(\rho_{i,t}(\theta),\, 1-\epsilon,\, 1+\epsilon\right) \hat{A}_i\right) \right]$$

(7)

Thus GRPO preserves token-level policy ratios as in PPO, while using sequence-level, group-normalized advantages to stabilize learning under sparse rewards.

**Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)** DAPO (Yu et al., 2025) extends GRPO with two key modifications. First, it replaces symmetric clipping with asymmetric clipping, using distinct thresholds for upward and downward deviations: $\operatorname{clip}_{\operatorname{asym}}(\rho) = \operatorname{clip}(\rho,\, 1 - \epsilon^-,\, 1 + \epsilon^+)$, where $\epsilon^-$ and $\epsilon^+$ are hyper-parameters.

Second, DAPO changes the aggregation scheme to operate at the *prompt level*. For a given prompt $x \sim D$, the old policy produces $G$ completions $\{y_i\}_{i=1}^G$ with advantages $\{\hat{A}_i\}$ (Equation (5)). Let $T = \sum_{i=1}^G |y_i|$ denote the total number of tokens across all completions, with token-level ratios as in Equation (6).
The DAPO surrogate objective is

$$\mathcal{J}_{\text{DAPO}}(\theta) = \mathbb{E}_{\substack{x \sim D, \\ \{y_i\}_{i=1}^G \sim \pi_{\text{gen}}(\cdot|x,\theta_{\text{old}})}} \left[ \frac{1}{T} \sum_{i=1}^G \sum_{t=1}^{|y_i|} \min\left(\rho_{i,t}(\theta) \hat{A}_i, \ \text{clip}_{\text{asym}}\left(\rho_{i,t}(\theta)\right) \hat{A}_i\right) \right].$$

(8)

This prompt-level normalization ensures that each token contributes equally to the prompt's loss, regardless of the number or length of its sampled completions. DAPO also dynamically drops zero-variance prompts from the batch during training, filling the batch with additional prompts until it is full. We skip that change here since its effect is similar to having a larger batch size.

## A.3 Training Setup

**Datasets** For small-scale SFT, we use a curated data mix of reasoning traces. We filter this dataset by removing trivial prompts, discarding solution traces exceeding 12k tokens, and decontaminating against the AIME 2024/2025 (AoPS, 2025) and MATH-500 (Hendrycks et al., 2021) benchmarks. For the RL stage, we use the Polaris-53K dataset (An et al., 2025) for most of our runs, additionally using the Deepcoder dataset (Luo et al., 2025) for runs with both math and code.

**Supervised Fine-tuning** We run SFT using a batch size of 2M tokens, a max sequence length of 12288, and a learning rate of $3 \times 10^{-5}$ with the AdamW optimizer (Loshchilov & Hutter, 2019) on 32 H100 GPU nodes, for approximately 4 epochs and 32B tokens in total.

**Reinforcement Learning** We allocate a 14k-token generation budget during RL training, where 12k tokens are allocated to the intermediate reasoning ("thinking"), followed by 2k tokens for the final solution and answer. We sample 48 prompts in each batch, each with 16 generations per prompt, giving a total batch size of 768 completions per gradient update step.
Rewards of $+1$ and $-1$ are given to correct and incorrect traces, respectively. We use a constant learning rate of $5 \times 10^{-7}$, the AdamW optimizer (Loshchilov & Hutter, 2019) with $\epsilon = 10^{-15}$, weight decay of 0.01 (the default in AdamW), and a linear warmup of 100 steps. The lower $\epsilon$ is to avoid gradient clipping caused by epsilon underflow (Wortsman et al., 2023).

We use automated checkers such as Sympy (Meurer et al., 2017) or Math-Verify¹ for assessing the correctness of the final answer for math problems, after stripping out the thinking trace (...). We use a custom code-execution environment for coding problems involving unit tests and desired outputs.

¹https://github.com/huggingface/Math-Verify

We used 80 Nvidia GB200 GPUs for a single run, with a compute budget ranging from 3.5-4K GPU hours for establishing different design choices in Section 3.2, 16K for the leave-one-out experiments (Section 4), and finally 30K-100K GPU hours for our larger-scale runs (Section 5). We adopt a generator-trainer split between GPUs. For 80-GPU experiments, we set 64 of those as *generators*, responsible for generating reasoning traces using the optimized inference codebase. The remaining 16 GPUs act as *trainers*, which receive generated trajectories, perform policy updates, and periodically broadcast updated parameters back to the generators.

### A.4 What curve to fit?

Pre-training curves are usually fit with a power-law equation (Li et al., 2025b; Kaplan et al., 2020; Muennighoff et al., 2025), which in our case would model performance as $R_C = A - D/C^B$, $C \ge C_0$, where $D$ is a constant and $C_0$ marks the compute threshold beyond which the law holds. Intuitively, this implies that each multiplicative increase in compute yields a constant proportional gain in performance. For RL post-training, however, we find a sigmoidal fit (Equation (1)) more appropriate for several reasons.
First, for bounded metrics such as accuracy or reward, sigmoidal curves provide better predictive fits (Ruan et al., 2024; Srivastava et al., 2022); we observe the same, with accurate extrapolation to higher compute (Figure 1). Second, power laws are unbounded at low compute and are typically fit only beyond a threshold $C_0$. In RL, where training spans far fewer steps (e.g., only $\sim$75 evaluation points in Figure 1), discarding early points further reduces the already limited data available for fitting. Third, empirically, sigmoidal fits are substantially more robust and stable than power-law fits. Concretely, consider the 100k GPU-hour run on the 8B dense model shown in Figure 1. When we fit a power-law curve between 1.5k-50k GPU hours, it predicts an asymptotic performance of A = 1.0, which is clearly incorrect: the actual curve saturates near 0.65. In contrast, the sigmoidal fit yields an accurate prediction of A = 0.645. Moreover, the power-law fit is highly sensitive to the chosen fitting regime: fitting over (5k, 50k) GPU hours instead gives A = 0.74, while the sigmoidal fit remains robust and still predicts A = 0.645. Power-law models only recover the correct asymptote when fitted exclusively in the high-compute regime (e.g., 30k-60k GPU hours). However, our goal is to predict large-scale performance from lower-compute regimes, where such long runs are unavailable.

Given these considerations, we use the sigmoidal form throughout our analysis. Intuitively, a sigmoidal curve captures saturating returns: it grows slowly in the low-compute regime, accelerates sharply through a mid-range of efficient scaling, and then saturates at high compute as it approaches a finite performance ceiling.

One thing to note is that in the high-compute regime, the sigmoidal curve behaves the same as a power law.
Concretely, we can derive the following approximation of the sigmoidal curve:

$$R_C = R_0 + \frac{A - R_0}{1 + (C_{mid}/C)^B}$$ (sigmoidal curve from Equation (1))

$$\Rightarrow R_C \approx R_0 + (A - R_0) \left(1 - \frac{C_{mid}^B}{C^B}\right)$$ (for $C \gg C_{mid}$, the high-compute regime)

$$= A - \frac{(A - R_0)C_{mid}^B}{C^B} = A - \frac{D}{C^B},$$

where $D = (A - R_0)C_{mid}^B$. This is the same power-law form mentioned at the start of this section.

## A.5 Fitting scaling curves

We fit the sigmoid-law equation in Equation (1) to the mean reward on our held-out validation set. This set consists of 1,000 prompts held out from the Polaris-53k (An et al., 2025) math dataset, with 16 generations per prompt sampled at each evaluation, performed every 100 training steps.

Directly fitting all three parameters $\{A, B, C_{mid}\}$ is challenging. Instead, we perform a grid search over $A \in \{0.450, 0.455, 0.460, \dots, 0.800\}$ and $C_{mid} \in [100, 40000]$ (searching over 100 linearly spaced values), and for each candidate $(A, C_{mid})$ we fit $B$. The best fit (measured by the sum of squared residuals) across this grid is selected as the final curve. We use SciPy's curve_fit with default initialization; varying the initialization strategy produced identical results. To enable future studies to fit compute-performance RL scaling curves, we release a minimal code repository at www.devvrit.com/scalerl_curve_fitting.

To estimate the error margin of our fits, we trained three independent **ScaleRL** runs with a batch size of 768 and generation length of 14k (as used in Section 4), shown in Figure 8a. We found that the fitted values of A varied by at most $\pm 0.015$, suggesting 0.02 as a reasonable error margin on estimates of asymptotic performance. Estimating the error margin for the fitted value B is more difficult, as different algorithms with different A values can have different error margins for B.
However, for the purpose of comparing algorithms, we can safely deduce that if two methods achieve similar A values (within 0.02), the one with the higher B (after refitting with A fixed to the average of the two values) is at least as good in terms of scaling efficiency.
+
+ ## A.6 Comparing algorithms
+
+ Consistent with observations in large-scale pre-training, where the loss exhibits a sharp initial drop before settling into a predictable power-law decay (Li et al., 2025b), we observe a similar two-phase behavior in RL. The mean reward increases rapidly, almost linearly, during roughly the first epoch ( $\sim$ 1k steps, or $\sim$ 1.5k GPU hours for most runs), after which the curve follows sigmoidal-law behavior (see Figure 15 for the sigmoid-like shape). Our sigmoidal-law fits are applied to this latter portion of the training curve.
+
+ Unlike pre-training, our main goal is not to predict the performance of a fixed recipe, but to identify which algorithms and design choices scale reliably, and to design an algorithm whose scaling is predictable. Achieving highly robust fits typically requires very large runs with hundreds or thousands of evaluation points, which is impractical in our setting for two reasons. First, running all ablations at such scale would be computationally prohibitive. Second, many RL algorithms we compare are themselves not scalable to such extreme budgets: they often saturate much earlier or even degrade with more compute due to instability. For example, our baseline method (Section 3.2) destabilizes beyond $\sim 3500$ GPU-hours, once truncations of overlong generations exceed 10%, reducing the effective batch size. See Appendix A.15 for further discussion.
+
+ As we ablate across different axes in Section 3.2, we discover design choices that improve stability at higher compute.
Some ablated variants can scale further, e.g., $\sim 5 \mathrm{k}$ GPU hours for $\epsilon = 0.26$ in DAPO, $\sim 6 \mathrm{k}$ GPU hours with the FP32 precision fix (Section 3.2), and $\sim 7 \mathrm{k}$ GPU hours for CISPO. Once we combine the best design choices, we obtain a stable and scalable recipe, which allows us to run leave-one-out (LOO) experiments for $\sim 16 \mathrm{k}$ GPU hours per run.
+
+ ## A.7 Robustness of fits
+
+ One may wonder how robust our fitted curves are. We address a few relevant points below:
+
+ - For stable and scalable experiments, including all runs from Section 4 onward, changing the fitting regime (e.g., including or excluding the initial 1.5k GPU-hour range) yields similarly predictable results. For instance, in the 100k GPU-hour run on the 8B dense model, fitting over (1.5k, 50k) gives B = 1.70, A = 0.645; (0, 100k) gives B = 1.56, A = 0.655; (0, 50k) gives B = 1.70, A = 0.645; and (5k, 50k) gives B = 1.67, A = 0.645. Across these regimes, parameter values remain within the expected error margin (Appendix A.5).
+ - We nonetheless skip the low-compute regime because early training phases, especially in the less stable setups from Section 3.2, often plateau prematurely or deviate from the sigmoidal trend due to transient instabilities (see Appendix A.6, A.15). Excluding this region allows the fit to focus on the mid-to-high compute range, where saturation behavior is clearer and more consistent.
+ - The 1.5k GPU-hour threshold is a heuristic chosen empirically: it approximately corresponds to one epoch for most experiments in Section 3.2. Larger cutoffs reduced the number of fitting points, while smaller ones often introduced noise. We found 1.5k GPU hours to provide the best balance between fit stability and sample coverage, consistent with the practice of skipping the low-FLOPs regime in pre-training scaling analyses (Li et al., 2025b).
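The window-sensitivity check in the first bullet can be reproduced in miniature: fit the same synthetic sigmoidal curve over different compute windows and verify that the recovered asymptote stays within the error margin. This is a hedged sketch on synthetic data (not our experimental curves); a direct three-parameter fit is feasible here only because the synthetic data is nearly noiseless.

```python
import numpy as np
from scipy.optimize import curve_fit

def sigmoid_reward(C, A, B, C_mid):
    # Equation (1) with R0 = 0
    return A / (1.0 + (C_mid / C) ** B)

# Synthetic training curve with known asymptote A = 0.645 plus small eval noise
rng = np.random.default_rng(0)
C_all = np.linspace(100, 100_000, 400)
R_all = sigmoid_reward(C_all, 0.645, 1.7, 2500.0) + rng.normal(0, 0.002, C_all.size)

def fit_window(lo, hi):
    """Fit (A, B, C_mid) using only evaluation points inside [lo, hi] GPU hours."""
    mask = (C_all >= lo) & (C_all <= hi)
    popt, _ = curve_fit(sigmoid_reward, C_all[mask], R_all[mask],
                        p0=[0.6, 1.5, 3000.0],
                        bounds=([0.0, 0.1, 100.0], [1.0, 5.0, 40000.0]))
    return popt  # (A, B, C_mid)

# Mirror the fitting regimes from the bullet above (in GPU hours)
fits = {w: fit_window(*w) for w in [(1500, 50000), (0, 100000), (5000, 50000)]}
for window, (A_hat, B_hat, Cmid_hat) in fits.items():
    # Recovered asymptote stays within the ±0.02 error margin in every window
    assert abs(A_hat - 0.645) < 0.02
```

On a stable, well-behaved curve the fitted A barely moves as the window changes; on the unstable early-training segments discussed above, the same check is exactly where the fits start to drift.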
+
+ ## A.8 Interpreting Sigmoidal Curves
+
+ Figure 3 presented an example fit illustrating the influence of the parameters A, B, and $C_{\rm mid}$ . Here, we extend this with additional illustrations: Figure 12a, Figure 12b, and Figure 13a vary B, $C_{\rm mid}$ , and A respectively, while keeping the other parameters fixed. We observe that B and $C_{\rm mid}$ primarily affect the efficiency of scaling, whereas A determines the asymptotic performance achievable at large compute. Figure 13b shows two runs where one is much more efficient, and hence shows promising initial gains, but converges to a lower asymptote, while the other progresses more slowly yet ultimately surpasses it due to a higher A. In practice, scaling strategies should prioritize design choices that raise the asymptotic ceiling A, and only then optimize for efficiency parameters such as B or $C_{\rm mid}$ .
+
+ ![](_page_21_Figure_0.jpeg)
+
+ Figure 12 Keeping all other parameters fixed and varying only (a) B, (b) $C_{mid}$ . Both parameters modulate the efficiency of the training run.
+
+ ![](_page_21_Figure_2.jpeg)
+
+ Figure 13 (a) Keeping all other parameters fixed and varying only A. (b) A design choice can be less efficient yet reach a higher asymptote. When designing scalable methods, one should prioritize choices that raise the asymptotic ceiling A, since the ultimate goal is maximizing performance at scale.
+
+ ## A.9 Forward and LOO Ablations
+
+ We show additional results for Section 3.2 in Figures 14a-14b. We also plot pass rate vs. compute for the leave-one-out experiments from Section 4 in Figure 15.
+
+ ## A.10 Controlling generation length
+
+ A common concern in reasoning RL is controlling exploding generation lengths, which harm both training efficiency and stability (Appendix A.15).
We consider two approaches: (a) *interruptions*, used in works like GLM-4.1V (GLM-V Team et al., 2025) and Qwen3 (Yang et al., 2025), and (b) *length penalties*, used in works like DAPO (Yu et al., 2025), Kimi (Kimi Team et al., 2025b), Magistral (Rastogi et al., 2025), and Minimax-M1 (MiniMax et al., 2025).
+
+ **Interruptions** forcibly stop generation by appending a marker phrase such as "Okay, time is up. Let me stop thinking and formulate a final answer ", signaling the model to terminate its reasoning and produce a final answer. In our setup, the interruption phrase is inserted at a random position between 10k and 12k tokens, to induce generalization across generation lengths.
+
+ **Length penalties** instead reshape the reward. Following DAPO (Yu et al., 2025), we penalize overly long completions with a tolerance interval $L_{\text{cache}}$ :
+
+ $$R_{\mathrm{length}}(y) = \mathrm{clip}\left(\frac{L_{\mathrm{max}} - |y|}{L_{\mathrm{cache}}} - 1, -1, 0\right)$$
+ (9)
+
+ This penalty is added only to correct traces, discouraging excessively long generations. In the length-penalty experiment, we set $L_{\text{max}} = 14$ k tokens and $L_{\text{cache}} = 2$ k tokens.
+
+ In Section 4, we compare length penalty and interruption at a scale of 16k GPU-hours. We find that replacing interruption with length penalty in our final **ScaleRL** recipe does not improve performance.
+
+ ![](_page_22_Figure_0.jpeg)
+
+ Figure 14 Comparing (a) loss aggregation, (b) different advantage normalization techniques.
+
+ ![](_page_22_Figure_2.jpeg)
+
+ Figure 15 Comparison of different leave-one-out strategies using a 16k GPU-hours budget. loo-plan refers to using prompt-level advantage normalization, loo-lp means using length penalty, loo-batch refers to using the entire batch without filtering 0-variance prompts.
loo-8op refers to using PPO-offpolicy-8, loo-fp32 means not using the FP32 precision fix, loo-savg means using sample-average loss aggregation, and loo-dapo means using the DAPO loss function instead of CISPO. The table in Figure 7 gives the values of $C_{mid}$ in addition to A and B. We notice that all methods have similar values of A (within the $\pm 0.02$ error margin). Hence, all methods scale well but differ in the efficiency parameters B and $C_{mid}$ .
+
+ ## A.11 PipelineRL
+
+ Using the baseline setup, we ablated the off-policy parameter in PipelineRL (Figure 4b). Both off-policy values of 4 and 8 performed equally well, and we adopt 8 as the default setting when updating the baseline in Section 3.1.
+
+ Why does PipelineRL consistently outperform the classic PPO-off-policy approach (Sections 3.1 and 4)? We attribute this to its closer alignment with on-policy training. In PPO-off-policy, generation and training proceed in alternating phases: the trainer operates strictly on batches that are as off-policy as the chosen parameter k, making updates based on stale rollouts. In contrast, PipelineRL operates in a streaming fashion. As soon as a batch is available, it is passed to the trainer; likewise, as soon as a model update is ready, it is shared back to the generators, who immediately use it, including in the continuation of partially generated traces. This tight feedback loop keeps training closer to the on-policy regime, reducing the mismatch between generator and trainer distributions.
+
+ Importantly, this distinction affects the asymptotic performance A of the scaling curve, not just the efficiency exponent B. Very few axes shift the asymptote in this way, making the choice of off-policy algorithm one of the most consequential design decisions in RL post-training.
+
+ ## A.12 Entropy Curves: Scaling Batch Size
+
+ We tracked entropy on the held-out validation set throughout training.
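Concretely, the quantity tracked here can be computed as the mean per-token entropy of the policy's next-token distribution. A minimal sketch (the function name, and the assumption that per-position logits are available from the evaluation pass, are ours):

```python
import numpy as np

def mean_token_entropy(logits):
    """Mean entropy in nats over positions; logits has shape [num_tokens, vocab]."""
    # Numerically stable log-softmax
    z = logits - logits.max(axis=-1, keepdims=True)
    logp = z - np.log(np.exp(z).sum(axis=-1, keepdims=True))
    p = np.exp(logp)
    return float(-(p * logp).sum(axis=-1).mean())

# Uniform logits over a vocabulary of size V give the maximum entropy log(V)
V = 8
uniform = np.zeros((5, V))
assert abs(mean_token_entropy(uniform) - np.log(V)) < 1e-9

# A peaked distribution (one dominant logit) has much lower entropy
peaked = np.zeros((5, V))
peaked[:, 0] = 10.0
assert mean_token_entropy(peaked) < 0.1 * mean_token_entropy(uniform)
```

Averaging this quantity over the held-out prompts at each evaluation step yields the entropy trajectories discussed below.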
Across all experiments, spanning variations in batch size, number of tasks, generation length, and model scale, we observed a consistent overall decrease in entropy.
+
+ An interesting finding is that entropy may not always offer predictive insight into performance, as proposed by some recent works (e.g., Cui et al., 2025). In Figure 16, we plot entropy for ScaleRL runs with batch sizes 768 and 2048. Despite the 2048-batch run achieving much stronger downstream performance at every stage (Figure 10b), both runs followed nearly identical entropy trajectories per step. This highlights an important point: although entropy is sometimes used as a proxy for exploration, simply maintaining higher entropy does not translate into better generalization. Larger batches reduced entropy per step similarly to smaller batches, yet still yielded substantially better performance, underscoring batch size as a decisive factor.
+
+ ![](_page_23_Figure_3.jpeg)
+
+ Figure 16 Comparing entropy of large and small batch size runs across training steps.
+
+ Overall, our findings suggest that while entropy decreases consistently during training, it is not necessarily a reliable predictor of downstream performance. This observation reinforces the need to focus on algorithmic and scaling choices (e.g., batch size, off-policy method) in addition to entropy dynamics when aiming for improved performance, both on the training distribution and on downstream task distributions.
+
+ ## A.13 Scaling on multiple axes
+
+ We provide the remaining scaling-axes figures in Figure 17, with the corresponding downstream evaluations in Figure 18. We also provide the values of $A, B, C_{mid}$ in Table 1.
+
+ ## A.14 Downstream performance
+
+ In Figures 1, 9, 10b, and 18, we report a representative set of downstream evaluation curves.
These include **ScaleRL** runs with batch sizes $\{512,768,2048\}$ , a long-context training run with 32k generation length, the large-model (Scout) training run, a multi-task run (math + code), and a run varying the number of generations per prompt (with fixed batch size). For each setting we plot performance against compute. Moreover, downstream performance is better for larger batch sizes, longer generation lengths, and larger model sizes, mirroring the ordering of the validation-set curves.
+
+ ## A.15 Truncations and training instabilities
+
+ Across our experiments, we found that training instabilities were often linked to truncations. As generation length grew, many RL runs exhibited fluctuating truncation rates that sometimes increased over training.
+
+ ![](_page_24_Figure_0.jpeg)
+
+ Figure 17 Scaling to (a) different numbers of generations per prompt, (b) downstream performance for different numbers of generations per prompt.
+
+ | Experiment | C_mid | B | A |
+ |-------------------------------|-------|------|-------|
+ | ScaleRL | 2542 | 1.92 | 0.610 |
+ | ScaleRL-32k | 11272 | 1.89 | 0.645 |
+ | ScaleRL-8gen | 2542 | 2.44 | 0.585 |
+ | ScaleRL-24gen | 3054 | 2.22 | 0.590 |
+ | ScaleRL-32gen | 2936 | 2.07 | 0.595 |
+ | ScaleRL-Scout | 4242 | 1.65 | 0.710 |
+ | ScaleRL-bs512 | 2818 | 1.77 | 0.605 |
+ | ScaleRL-bs2048 | 10909 | 1.70 | 0.645 |
+ | ScaleRL-math+code, math curve | 2896 | 2.05 | 0.595 |
+ | ScaleRL-math+code, code curve | 1675 | 1.09 | 0.615 |
+
+ **Table 1** $C_{mid}$ , B, and A values for the large-scale runs in Section 5.
+
+ At batch size 768, we observed that truncation rates in the range of 10–15% typically destabilized training, with performance degrading and not recovering without intervention. Examples include the extended GRPO run in Figure 2, where instability correlated with rising truncation rates, and the updated baseline used in Section 3.2.
+
+ By contrast, **ScaleRL** runs were more stable.
On the 8B model, truncations remained below 5% for over 90% of training. At batch size 2048, truncations were slightly higher, occasionally approaching $\sim 7\%$ . This increase was largely attributable to the longer average generation lengths observed during training, which naturally raise the chance of exceeding the budget. Nevertheless, because the effective batch size (after excluding truncated samples) remained large, training stability was preserved. Intuitively, a larger generation-length budget should help reduce truncations: training with a 32k generation length (batch size 768) remained stable, with truncations briefly spiking to $\sim 4\%$ before quickly falling below 2%.
+
+ Larger models were even more robust. On the Scout run, truncations remained consistently below 2%, and for > 90% of training steps were under 1%. This likely reflects both the inherent ability of larger models to regulate generation length and their stronger instruction-following ability, which made interruption signals more effective.
+
+ ![](_page_25_Figure_0.jpeg)
+
+ ![](_page_25_Figure_1.jpeg)
+
+ Figure 18 Downstream performance of (a) different numbers of generations per prompt, on AIME, (b) LiveCodeBench (Jan-June 2025) performance on the math+code run, (c) AIME-24 performance on the math+code run.
+
+ Overall, we suggest practitioners monitor truncation rates closely. Our findings indicate that high truncation rates are a reliable warning signal of instability, while larger models, higher generation budgets, and careful design choices (as in **ScaleRL**) substantially mitigate this risk.
+
+ ## A.16 Comparing Prevalent Methods
+
+ In Figure 2 we compared some popular training recipes with **ScaleRL**. We briefly describe these existing recipes here.
+
+ **DeepSeek (GRPO)** This recipe mostly follows the DeepSeek work (Guo et al., 2025). We use GRPO as the loss function (Section A.2) with $\epsilon_{min} = \epsilon_{max} = 0.2$ , sample-average loss aggregation, and the PPO-offpolicy-8 algorithm.
We observed that training became unstable past 6k GPU hours due to truncations (Section A.15).
+
+ **Qwen2.5 (DAPO)** This recipe follows DAPO (Yu et al., 2025). It includes the DAPO loss function (Appendix A.2) with $\epsilon_{min} = 0.2$ , $\epsilon_{max} = 0.26$ (Appendix A.17.1), PPO-offpolicy-8, and prompt-average loss aggregation. The only change from the original DAPO paper (Yu et al., 2025) concerns dynamically filling the batch. Specifically, DAPO drops 0-variance prompts and samples more prompts until the batch is full. In our codebase, this was not efficient: for the PPO-offpolicy algorithm, each generator was pre-assigned #prompts/#generators prompts to roll out. If a specific generator had more 0-variance prompts, it sampled further prompts to complete its share, which could stall the other generators and slow down the run overall. To get around this, we instead used a larger batch size of 1280 (80 prompts, 16 generations each) and dropped 0-variance prompts from the batch. We noted that, post-dropping, the effective batch size was still greater than the 768 we used for ScaleRL. If anything, this gives some advantage to the DAPO recipe.
+
+ **Magistral** This refers to the recipe used in Rastogi et al. (2025). It is similar to the DAPO recipe, with the main difference that PipelineRL is used as the off-policy algorithm.
+
+ **MiniMax** This refers to the recipe used in MiniMax et al. (2025). It uses the CISPO loss, the FP32 precision fix at the LM head, the PPO-offpolicy algorithm, and prompt-average loss aggregation. Like DAPO, it drops 0-variance prompts, so we likewise give it a larger batch size of 1280.
+
+ ## A.17 Loss Type - Stability and Robustness
+
+ As discussed below, GRPO/DAPO-style losses are highly sensitive to the choice of the clipping-ratio hyperparameter $\epsilon_{\text{max}}$ . In contrast, CISPO and GSPO show far greater robustness.
For example, in Appendix A.17.2, varying $\epsilon_{\text{max}}$ for CISPO across $\{4,5,8\}$ produced no significant differences in performance. For GSPO, the $10^{-4}$ clipping scale used in the original paper (Zheng et al., 2025a) did not work well in our setting. We therefore ablated across broader scales and found that once the correct order of magnitude was identified (e.g., $4 \times 10^{-3}$ and higher), performance was stable and largely insensitive to fine-grained changes (e.g., $\{4 \times 10^{-3}, 5 \times 10^{-3}\}$ ).
+
+ ![](_page_26_Figure_2.jpeg)
+
+ Figure 19 (a) Comparing the upper clipping ratio of the DAPO loss function. Changing $\epsilon_{max}$ fundamentally changes the asymptotic performance value A. (b) CISPO clipping-ratio ablations.
+
+ ### A.17.1 DAPO clipping ratios
+
+ In this section, we analyze the role of the clipping threshold $\epsilon_{\text{max}}$ in the DAPO loss function (equation (8)). The hyperparameter sensitivity of $\epsilon_{max}$ has been observed in prior work: for example, GRPO typically sets $\epsilon_{\text{max}} = 0.2$ , while DAPO uses 0.28. However, beyond tuning sensitivity, we find that $\epsilon_{\text{max}}$ directly alters the scaling behavior of the algorithm. As $\epsilon_{\text{max}}$ increases, the terminal reward A increases until an optimal range is reached, after which A decreases again. This is a striking effect: unlike many hyperparameters that merely shift convergence speed, $\epsilon_{\text{max}}$ governs the asymptotic error itself.
+
+ ### A.17.2 CISPO Clipping Ratios
+
+ We ablate the higher clipping ratio for CISPO, keeping the lower clipping ratio fixed at 0 (Figure 19b). Across a wide range of values, we find little difference in performance, indicating that CISPO is largely insensitive to this hyperparameter. This robustness mirrors our findings for GSPO (Section A.17.3), and stands in contrast to DAPO/GRPO-style objectives, which are highly sensitive to the exact choice of clipping threshold.
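For reference, the per-token gradient behavior of a CISPO-style objective can be sketched as follows. This is an illustrative reading of the loss form (following MiniMax-M1), not the paper's training code: the epsilon values are hypothetical, and the interpretation of the lower clipping ratio as a bound of 0 on the importance ratio is our assumption. Because the clipped importance ratio is stop-gradiented, raising the upper threshold only rescales gradient weights rather than zeroing tokens, which is consistent with the insensitivity observed here.

```python
import numpy as np

def cispo_token_grads(logp_new, logp_old, adv, eps_max=4.0):
    """Per-token gradient weights d(loss)/d(logp_new) for a CISPO-style loss.

    loss = -mean( sg(clip(r, 0, 1 + eps_max)) * adv * logp_new ), with
    r = exp(logp_new - logp_old). The clip(..) factor is a stop-gradiented
    constant, so each token keeps a bounded, non-zero gradient weight
    instead of being dropped as under PPO/DAPO-style clipping.
    """
    r = np.exp(logp_new - logp_old)
    # Lower bound 0 is our reading of "lower clipping ratio fixed at 0"
    r_clipped = np.clip(r, 0.0, 1.0 + eps_max)
    return -(r_clipped * adv) / len(logp_new)

# A strongly off-policy token (importance ratio 10) still gets a bounded update
logp_new = np.log(np.array([0.5, 0.1]))
logp_old = np.log(np.array([0.05, 0.1]))  # first token: ratio 10, second: ratio 1
grads = cispo_token_grads(logp_new, logp_old, adv=np.array([1.0, 1.0]), eps_max=4.0)
```

Here the first token's weight is clipped to (1 + eps_max) = 5 rather than zeroed, so moving eps_max within a wide range changes only the magnitude of a few extreme-ratio tokens' updates.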
Such stability under hyperparameter variation makes CISPO a strong candidate for default use in large-scale training.
+
+ ### A.17.3 GSPO ablations
+
+ We ablate the clipping-ratio scale used in GSPO, as shown in Figure 20a. The default $10^{-4}$ scale given in the GSPO paper (Zheng et al., 2025a) does not scale best for our 8B model. The $10^{-3}$ scale performs as well as, or better than, the alternatives (Figure 20a). Given this scale, we further varied the upper clipping ratio in $\{4 \times 10^{-3}, 5 \times 10^{-3}\}$ and found that $5 \times 10^{-3}$ yielded a slightly better fit (Figure 20b).
+
+ An important observation is that GSPO is quite robust to the choice of clipping ratio. Once the correct scale is identified, most nearby values, or even a larger scale, perform similarly. This robustness contrasts sharply with DAPO-style losses, which are highly sensitive to the exact value of the higher clipping ratio, as noted in Section 3.2.
+
+ ![](_page_27_Figure_1.jpeg)
+
+ Figure 20 (a) GSPO scale comparison. gspo\_x\_y\_e\_z in the legend denotes an upper threshold of $x \times 10^{-z}$ and a lower threshold of $y \times 10^{-z}$ . (b) At the $10^{-3}$ scale, we found similar performance for both $4\_3\_e\_3$ and $5\_3\_e\_3$ , with the latter performing slightly better.
+
+ ### A.17.4 GSPO vs CISPO
+
+ Despite its hyperparameter robustness, we encountered stability issues with GSPO. On multiple occasions, GSPO runs diverged mid-training, leading to sudden drops in performance. For 8B models, restarting from a stable checkpoint allowed recovery, but this strategy failed on larger models such as Scout, where instability persisted despite repeated resets to a stable checkpoint. We checked for implementation bugs to the best of our ability and did not find any.
+
+ Overall, while all three loss families can be competitive under tuned settings, CISPO offers the best balance of stability and robustness to hyperparameters, making it our recommended choice.
\ No newline at end of file diff --git a/research/papers/arXiv-scalerl/arXiv-2510.13786v1_meta.json b/research/papers/arXiv-scalerl/arXiv-2510.13786v1_meta.json new file mode 100644 index 0000000..ac12c18 --- /dev/null +++ b/research/papers/arXiv-scalerl/arXiv-2510.13786v1_meta.json @@ -0,0 +1,2040 @@ +{ + "table_of_contents": [ + { + "title": "The Art of Scaling Reinforcement Learning Compute for LLMs", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 85.5, + 80.05078125 + ], + [ + 454.5, + 80.05078125 + ], + [ + 454.5, + 117.94921875 + ], + [ + 85.5, + 117.94921875 + ] + ] + }, + { + "title": "1 Introduction", + "heading_level": null, + "page_id": 1, + "polygon": [ + [ + 69.0, + 65.35546875 + ], + [ + 165.75, + 65.35546875 + ], + [ + 165.75, + 77.25 + ], + [ + 69.0, + 77.25 + ] + ] + }, + { + "title": "2 Preliminaries & Setup", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 69.0, + 556.875 + ], + [ + 222.0, + 556.875 + ], + [ + 222.0, + 570.796875 + ], + [ + 69.0, + 570.796875 + ] + ] + }, + { + "title": "2.1 Predictive compute-scaling and fitting curves", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 70.224609375, + 486.0 + ], + [ + 323.25, + 486.0 + ], + [ + 323.25, + 495.0 + ], + [ + 70.224609375, + 495.0 + ] + ] + }, + { + "title": "3 An Empirical Study of RL Scaling", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 70.224609375, + 360.03515625 + ], + [ + 288.0, + 360.03515625 + ], + [ + 288.0, + 373.18359375 + ], + [ + 70.224609375, + 373.18359375 + ] + ] + }, + { + "title": "3.1 Asynchronous RL Setup", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 69.0, + 526.7109375 + ], + [ + 216.3515625, + 526.7109375 + ], + [ + 216.3515625, + 537.0 + ], + [ + 69.0, + 537.0 + ] + ] + }, + { + "title": "3.2 Algorithmic Choices", + "heading_level": null, + "page_id": 5, + "polygon": [ + [ + 69.75, + 410.25 + ], + [ + 198.0, + 410.25 + ], + [ + 198.0, + 419.9765625 + ], + [ + 69.75, + 419.9765625 + ] 
+ ] + }, + { + "title": "4 ScaleRL: Scaling RL Compute Effectively & Predictably", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ + 69.92578125, + 370.86328125 + ], + [ + 423.0, + 370.86328125 + ], + [ + 423.0, + 384.01171875 + ], + [ + 69.92578125, + 384.01171875 + ] + ] + }, + { + "title": "5 Predictable Scaling Returns Across RL Compute Axes", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 69.75, + 308.6015625 + ], + [ + 413.25, + 308.6015625 + ], + [ + 413.25, + 320.9765625 + ], + [ + 69.75, + 320.9765625 + ] + ] + }, + { + "title": "6 Related Work", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 69.75, + 582.75 + ], + [ + 173.25, + 582.75 + ], + [ + 173.25, + 594.0 + ], + [ + 69.75, + 594.0 + ] + ] + }, + { + "title": "7 Discussion & Conclusion", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.75, + 474.75 + ], + [ + 237.0, + 474.75 + ], + [ + 237.0, + 485.25 + ], + [ + 69.75, + 485.25 + ] + ] + }, + { + "title": "8 Acknowledgments", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 70.822265625, + 474.50390625 + ], + [ + 204.0, + 474.50390625 + ], + [ + 204.0, + 487.65234375 + ], + [ + 70.822265625, + 487.65234375 + ] + ] + }, + { + "title": "References", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 69.75, + 566.9296875 + ], + [ + 140.25, + 566.9296875 + ], + [ + 140.25, + 577.5 + ], + [ + 69.75, + 577.5 + ] + ] + }, + { + "title": "A Appendix", + "heading_level": null, + "page_id": 17, + "polygon": [ + [ + 69.328125, + 65.7421875 + ], + [ + 150.75, + 65.7421875 + ], + [ + 150.75, + 78.1171875 + ], + [ + 69.328125, + 78.1171875 + ] + ] + }, + { + "title": "A.1 Extended Related Work", + "heading_level": null, + "page_id": 17, + "polygon": [ + [ + 69.626953125, + 93.0 + ], + [ + 216.0, + 93.0 + ], + [ + 216.0, + 102.75 + ], + [ + 69.626953125, + 102.75 + ] + ] + }, + { + "title": "A.2 RL for LLMs: GRPO and DAPO", + "heading_level": null, + "page_id": 17, + "polygon": [ 
+ [ + 69.0, + 474.0 + ], + [ + 252.75, + 474.0 + ], + [ + 252.75, + 483.75 + ], + [ + 69.0, + 483.75 + ] + ] + }, + { + "title": "A.3 Training Setup", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 69.0, + 284.625 + ], + [ + 171.75, + 284.625 + ], + [ + 171.75, + 295.5 + ], + [ + 69.0, + 295.5 + ] + ] + }, + { + "title": "A.4 What curve to fit?", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 69.75, + 645.43359375 + ], + [ + 189.0, + 645.43359375 + ], + [ + 189.0, + 654.75 + ], + [ + 69.75, + 654.75 + ] + ] + }, + { + "title": "A.5 Fitting scaling curves", + "heading_level": null, + "page_id": 19, + "polygon": [ + [ + 69.0, + 477.75 + ], + [ + 204.75, + 477.75 + ], + [ + 204.75, + 487.5 + ], + [ + 69.0, + 487.5 + ] + ] + }, + { + "title": "A.6 Comparing algorithms", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 69.328125, + 64.1953125 + ], + [ + 210.0, + 64.1953125 + ], + [ + 210.0, + 75.0234375 + ], + [ + 69.328125, + 75.0234375 + ] + ] + }, + { + "title": "A.7 Robustness of fits", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 69.0, + 326.00390625 + ], + [ + 189.75, + 326.00390625 + ], + [ + 189.75, + 335.25 + ], + [ + 69.0, + 335.25 + ] + ] + }, + { + "title": "A.8 Interpreting Sigmoidal Curves", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 69.626953125, + 570.75 + ], + [ + 249.0, + 570.75 + ], + [ + 249.0, + 580.5 + ], + [ + 69.626953125, + 580.5 + ] + ] + }, + { + "title": "A.9 Forward and LOO Ablations", + "heading_level": null, + "page_id": 21, + "polygon": [ + [ + 69.0, + 389.25 + ], + [ + 237.0, + 389.25 + ], + [ + 237.0, + 399.0 + ], + [ + 69.0, + 399.0 + ] + ] + }, + { + "title": "A.10 Controlling generation length", + "heading_level": null, + "page_id": 21, + "polygon": [ + [ + 69.0, + 447.75 + ], + [ + 250.5, + 447.75 + ], + [ + 250.5, + 457.875 + ], + [ + 69.0, + 457.875 + ] + ] + }, + { + "title": "A.11 PipelineRL", + "heading_level": null, + "page_id": 22, + 
"polygon": [ + [ + 69.0, + 489.0 + ], + [ + 157.5, + 489.0 + ], + [ + 157.5, + 499.5 + ], + [ + 69.0, + 499.5 + ] + ] + }, + { + "title": "A.12 Entropy Curves: Scaling Batch Size", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 69.0, + 64.1953125 + ], + [ + 279.75, + 64.1953125 + ], + [ + 279.75, + 75.0234375 + ], + [ + 69.0, + 75.0234375 + ] + ] + }, + { + "title": "A.13 Scaling on multiple axes", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 69.0, + 514.3359375 + ], + [ + 222.75, + 514.3359375 + ], + [ + 222.75, + 523.6171875 + ], + [ + 69.0, + 523.6171875 + ] + ] + }, + { + "title": "A.14 Downstream performance", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 69.0, + 571.5 + ], + [ + 237.0, + 571.5 + ], + [ + 237.0, + 582.0 + ], + [ + 69.0, + 582.0 + ] + ] + }, + { + "title": "A.15 Truncations and training instabilities", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 69.626953125, + 678.0 + ], + [ + 285.75, + 678.0 + ], + [ + 285.75, + 687.75 + ], + [ + 69.626953125, + 687.75 + ] + ] + }, + { + "title": "A.16 Comparing Prevalent Methods", + "heading_level": null, + "page_id": 25, + "polygon": [ + [ + 69.626953125, + 362.25 + ], + [ + 257.25, + 362.25 + ], + [ + 257.25, + 372.75 + ], + [ + 69.626953125, + 372.75 + ] + ] + }, + { + "title": "A.17 Loss Type - Stability and Robustness", + "heading_level": null, + "page_id": 26, + "polygon": [ + [ + 69.328125, + 64.58203125 + ], + [ + 287.25, + 64.58203125 + ], + [ + 287.25, + 75.0 + ], + [ + 69.328125, + 75.0 + ] + ] + }, + { + "title": "A.17.1 DAPO clipping ratios", + "heading_level": null, + "page_id": 26, + "polygon": [ + [ + 69.0, + 409.5 + ], + [ + 191.25, + 409.5 + ], + [ + 191.25, + 419.203125 + ], + [ + 69.0, + 419.203125 + ] + ] + }, + { + "title": "A.17.2 CISPO Clipping Ratios", + "heading_level": null, + "page_id": 26, + "polygon": [ + [ + 69.0, + 513.0 + ], + [ + 198.75, + 513.0 + ], + [ + 198.75, + 522.0 + ], + [ + 69.0, + 522.0 + ] + ] + 
}, + { + "title": "A.17.3 GSPO ablations", + "heading_level": null, + "page_id": 26, + "polygon": [ + [ + 69.0, + 617.203125 + ], + [ + 171.75, + 617.203125 + ], + [ + 171.75, + 626.484375 + ], + [ + 69.0, + 626.484375 + ] + ] + }, + { + "title": "A.17.4 GSPO vs CISPO", + "heading_level": null, + "page_id": 27, + "polygon": [ + [ + 69.0, + 354.0 + ], + [ + 173.25, + 354.0 + ], + [ + 173.25, + 363.0 + ], + [ + 69.0, + 363.0 + ] + ] + } + ], + "page_stats": [ + { + "page_id": 0, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 77 + ], + [ + "Span", + 35 + ], + [ + "Text", + 3 + ], + [ + "Footnote", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 7267, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 1, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 57 + ], + [ + "Span", + 41 + ], + [ + "Text", + 7 + ], + [ + "ListItem", + 2 + ], + [ + "SectionHeader", + 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1108, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 2, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 64 + ], + [ + "Span", + 42 + ], + [ + "Text", + 5 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 3, + "text_extraction_method": "surya", + 
"block_counts": [ + [ + "Line", + 51 + ], + [ + "Span", + 41 + ], + [ + "Text", + 9 + ], + [ + "Equation", + 3 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + "llm_tokens_used": 3345, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 4, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 62 + ], + [ + "Line", + 58 + ], + [ + "Text", + 6 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 5, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 59 + ], + [ + "Span", + 33 + ], + [ + "Text", + 6 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1294, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 6, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 61 + ], + [ + "Span", + 20 + ], + [ + "Text", + 5 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 7, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 51 + ], + [ + "Span", + 33 + ], + [ + "Text", + 5 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + 
"Equation", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1596, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 8, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 83 + ], + [ + "Span", + 37 + ], + [ + "Text", + 6 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 9, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 78 + ], + [ + "Span", + 17 + ], + [ + "Text", + 5 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 10, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 95 + ], + [ + "Span", + 12 + ], + [ + "Text", + 3 + ], + [ + "Figure", + 2 + ], + [ + "Caption", + 2 + ], + [ + "FigureGroup", + 2 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 11, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 63 + ], + [ + "Span", + 16 + ], + [ + "Text", + 4 + ], + [ + "ListItem", + 3 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": 
{ + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 12, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 58 + ], + [ + "Span", + 18 + ], + [ + "Text", + 8 + ], + [ + "ListItem", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 13, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 64 + ], + [ + "Span", + 21 + ], + [ + "ListItem", + 16 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 14, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 66 + ], + [ + "Span", + 34 + ], + [ + "ListItem", + 16 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 15, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 67 + ], + [ + "Span", + 22 + ], + [ + "ListItem", + 17 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 16, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 5 + ], + [ + "Span", + 2 + ], + [ + "Text", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + 
"llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 17, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 56 + ], + [ + "Span", + 31 + ], + [ + "Text", + 7 + ], + [ + "SectionHeader", + 3 + ], + [ + "Equation", + 3 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + "llm_tokens_used": 2828, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 18, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 67 + ], + [ + "Line", + 56 + ], + [ + "Text", + 9 + ], + [ + "SectionHeader", + 2 + ], + [ + "Equation", + 1 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1158, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 19, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 55 + ], + [ + "Span", + 51 + ], + [ + "Text", + 7 + ], + [ + "Equation", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1295, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 20, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 72 + ], + [ + "Line", + 57 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 3 + ], + [ + "ListItem", + 3 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 21, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 54 + ], + [ + "Span", + 37 + ], + [ + "Text", + 6 + ], + [ + 
"Figure", + 2 + ], + [ + "Caption", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "FigureGroup", + 2 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 972, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 22, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 78 + ], + [ + "Span", + 27 + ], + [ + "Text", + 3 + ], + [ + "Figure", + 2 + ], + [ + "Caption", + 2 + ], + [ + "FigureGroup", + 2 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 23, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 58 + ], + [ + "Span", + 20 + ], + [ + "Text", + 6 + ], + [ + "SectionHeader", + 4 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 24, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 88 + ], + [ + "Line", + 58 + ], + [ + "Span", + 25 + ], + [ + "Text", + 3 + ], + [ + "Caption", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Table", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 2669, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 25, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 52 + ], + [ + "Span", + 21 + ], + [ + "Text", + 6 + ], + [ + "Figure", + 2 + ], + [ + "Caption", + 1 + ], 
+ [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 26, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 65 + ], + [ + "Span", + 45 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 4 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 27, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 45 + ], + [ + "Span", + 16 + ], + [ + "Text", + 3 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + } + ], + "debug_data_path": "debug_data/arXiv:2510.13786v1" +} \ No newline at end of file diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_0_Figure_8.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_0_Figure_8.jpeg new file mode 100644 index 0000000..a26e660 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_0_Figure_8.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_0_Picture_2.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_0_Picture_2.jpeg new file mode 100644 index 0000000..94d1348 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_0_Picture_2.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_10_Figure_9.jpeg 
b/research/papers/arXiv-tricks_or_traps_part1/_page_10_Figure_9.jpeg new file mode 100644 index 0000000..cd6b2fe Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_10_Figure_9.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_11_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_11_Figure_0.jpeg new file mode 100644 index 0000000..d9dd56a Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_11_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_12_Figure_3.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_12_Figure_3.jpeg new file mode 100644 index 0000000..5907172 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_12_Figure_3.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_13_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_13_Figure_0.jpeg new file mode 100644 index 0000000..5b4d4f7 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_13_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_14_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_14_Figure_0.jpeg new file mode 100644 index 0000000..8103e36 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_14_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_15_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_15_Figure_0.jpeg new file mode 100644 index 0000000..2b7f2b2 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_15_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_16_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_16_Figure_0.jpeg new file mode 100644 index 0000000..bd0d877 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_16_Figure_0.jpeg differ diff 
--git a/research/papers/arXiv-tricks_or_traps_part1/_page_17_Figure_1.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_17_Figure_1.jpeg new file mode 100644 index 0000000..e587689 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_17_Figure_1.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_18_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_18_Figure_0.jpeg new file mode 100644 index 0000000..09fc584 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_18_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_23_Figure_7.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_23_Figure_7.jpeg new file mode 100644 index 0000000..f919fa2 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_23_Figure_7.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_24_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_24_Figure_0.jpeg new file mode 100644 index 0000000..02df632 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_24_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_24_Figure_4.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_24_Figure_4.jpeg new file mode 100644 index 0000000..1532e74 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_24_Figure_4.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_25_Figure_1.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_25_Figure_1.jpeg new file mode 100644 index 0000000..bc7729c Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_25_Figure_1.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_4_Figure_9.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_4_Figure_9.jpeg new file mode 100644 index 0000000..333fdee Binary files /dev/null and 
b/research/papers/arXiv-tricks_or_traps_part1/_page_4_Figure_9.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_5_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_5_Figure_0.jpeg new file mode 100644 index 0000000..d566a10 Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_5_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_7_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_7_Figure_0.jpeg new file mode 100644 index 0000000..23d486c Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_7_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_8_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_8_Figure_0.jpeg new file mode 100644 index 0000000..39119ec Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_8_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_9_Figure_0.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_9_Figure_0.jpeg new file mode 100644 index 0000000..1a8f97e Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_9_Figure_0.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/_page_9_Figure_2.jpeg b/research/papers/arXiv-tricks_or_traps_part1/_page_9_Figure_2.jpeg new file mode 100644 index 0000000..639e4ea Binary files /dev/null and b/research/papers/arXiv-tricks_or_traps_part1/_page_9_Figure_2.jpeg differ diff --git a/research/papers/arXiv-tricks_or_traps_part1/arXiv-2508.08221.md b/research/papers/arXiv-tricks_or_traps_part1/arXiv-2508.08221.md new file mode 100644 index 0000000..a5c4529 --- /dev/null +++ b/research/papers/arXiv-tricks_or_traps_part1/arXiv-2508.08221.md @@ -0,0 +1,485 @@ +![](_page_0_Picture_2.jpeg) + +# Part I: Tricks or Traps? 
A Deep Dive into RL for LLM Reasoning + +Zihe Liu $^{*\Diamond}\alpha$ , Jiashun Liu $^{*\Diamond\alpha}$ , Yancheng He $^{*\alpha}$ , Weixun Wang $^{*\dagger\alpha}$ , Jiaheng Liu $^{\Omega}$ , Ling Pan $^{\Diamond}$ , Xinyu Hu $^{\alpha}$ ¶, Shaopan Xiong $^{\alpha}$ , Ju Huang $^{\alpha}$ , Jian Hu $^{\clubsuit}$ , Shengyi Huang $^{\ddagger}$ , Siran Yang $^{\alpha}$ , Jiamang Wang $^{\alpha}$ , Wenbo Su $^{\alpha}$ , Bo Zheng $^{\alpha}$ + + $^{\alpha}$ Alibaba Group $^{\heartsuit}$ Beijing Jiaotong University $^{\diamond}$ Hong Kong University of Science and Technology $^{\Omega}$ Nanjing University $^{\P}$ Peking University $^{\clubsuit}$ OpenRLHF $^{\ddagger}$ CleanRL + +# **Abstract** + +Reinforcement learning for LLM reasoning has rapidly emerged as a prominent research area, marked by a significant surge in related studies on both algorithmic innovations and practical applications. Despite this progress, several critical challenges remain, including the absence of standardized guidelines for employing RL techniques and a fragmented understanding of their underlying mechanisms. Additionally, inconsistent experimental settings, variations in training data, and differences in model initialization have led to conflicting conclusions, obscuring the key characteristics of these techniques and creating confusion among practitioners when selecting appropriate techniques. This paper systematically reviews widely adopted RL techniques through rigorous reproductions and isolated evaluations within a unified open-source framework. We analyze the internal mechanisms, applicable scenarios, and core principles of each technique through fine-grained experiments, including datasets of varying difficulty, model sizes, and architectures. Based on these insights, we present clear guidelines for selecting RL techniques tailored to specific setups, and provide a reliable roadmap for practitioners navigating the RL for the LLM domain. 
Finally, we reveal that a minimalist combination of two techniques can unlock the learning capability of critic-free policies using vanilla PPO loss. The results demonstrate that our simple combination consistently improves performance, surpassing strategies like GRPO and DAPO. + +![](_page_0_Figure_8.jpeg) + +Figure 1: **Left**: The proliferation of RL optimization techniques, coupled with diverse initialized models and data, has raised barriers to practical adoption. **Right**: We establish detailed application guidelines via dissecting internal mechanisms of widely-used tricks, and introduce **Lite PPO**, a minimalist two-technique combination that enhances learning capacity in critic-free policies with vanilla PPO loss. The average accuracy is calculated across six mathematical benchmarks. + +\* Equal Contribution. † Corresponding to: Weixun Wang . + +# 1 Introduction + +Recent breakthroughs in large language models (LLMs) such as OpenAI o1 (Wu et al., 2024) and DeepSeek R1 (Shao et al., 2024) have positioned reinforcement learning (RL) as a key driver in unlocking advanced reasoning capabilities within LLMs. This is particularly evident in challenging reasoning tasks like mathematical reasoning (He et al., 2025a) and code generation (Zhuo et al., 2025), where RL has demonstrated the potential to elevate LLM performance beyond what pre-training alone can achieve. Such an emerging trend has sparked widespread interest within the research community in the direction of "RL for LLM" (or RL4LLM). In 2025, RL4LLM experienced a surge in research activity, leading to hundreds of publications across arXiv and major conferences, covering a wide range of topics from algorithmic innovation to practical engineering solutions. + +However, this rapid progress is shadowed by the lack of usage guidelines for existing RL techniques or tricks (Huang et al., 2024a) as well as the absence of in-depth analysis of their underlying mechanisms. 
Specifically, these limitations can manifest as confusion among practitioners in choosing RL tricks, as different papers provide different solutions to the same problem. For instance, GRPO (Shao et al., 2024) advocates group-level normalization to enhance policy stability, whereas REINFORCE++ (Hu et al., 2025) argues that batch-level normalization works better. Moreover, GRPO incorporates variance in normalization, yet Dr. GRPO (Liu et al., 2025a) explicitly recommends removing variance normalization to prevent bias. As another example, GRPO (Shao et al., 2024) achieved a breakthrough in performance through response-level loss calculation, while DAPO (Yu et al., 2025) instead adopted token-level loss calculation. Such contradictory phenomena underscore the fragmented understanding and inconsistent recommendations within the RL4LLM community. One possible reason is that experimental settings, training data, and model initialization differ substantially across existing work, which can bias the conclusions each study draws. + +Apart from the confusion caused by the intrinsic differences among similar techniques, the numerous and seemingly orthogonal techniques, including *Normalization*, *Clip*, and *Overlong Filtering*, also complicate algorithm application in practice. Practitioners face non-trivial challenges in selecting an appropriate combination from a wide range of techniques to unlock the learning capacity of LLMs in specific scenarios. These ambiguities naturally raise a key question for practitioners: + +Which scenarios is each existing technique suited to? Is there a simple and generalized combination that can be used to enhance policy optimization?
+ +Aligned with classic RL mechanism analysis methodologies (Andrychowicz et al., 2020; Engstrom et al., 2020; Huang et al., 2024a), we systematically review the widely used RL techniques by reproducing them and independently evaluating the actual impact of each technique, based on the same open-source infrastructure framework and policy models. To comprehensively cover practical scenarios, we design extensive experimental settings incorporating datasets of varying difficulty levels, diverse model sizes, and distinct model types. Furthermore, we conduct an in-depth analysis of each technique's theoretical foundations, implementation details, and applicable scenarios. An intuitive overview of our contributions is illustrated in Figure 1. Specifically, ① our empirical results reveal that most RL techniques exhibit obvious preferences and sensitivities to the experimental setup, e.g., model type, data distribution, reward mechanism, and hyperparameters. ② Based on the isolated analysis under our setup, we demonstrate that employing only two techniques, i.e., advantage normalization (group-level mean, batch-level std) and token-level loss aggregation, can unlock the learning capability of critic-free policies using vanilla PPO loss, surpassing mainstream RL4LLM algorithms incorporating redundant components. Our core findings are summarized as follows: + +- 1. Group-level normalization shows robust efficiency under each reward setting. Batch-level normalization provides more stable improvement under large-scale reward settings. (§4.1.1) +- 2. Group-level mean and batch-level standard deviation enable further robust normalization. (§4.1.3) +- 3. Clip Higher promotes high-quality exploration for aligned models. (§4.2.1) +- 4. There appears to be a "scaling law" between performance and the upper clipping bound on small-sized models. (§4.2.3) +- 5.
Compared to sequence-level loss aggregation, token-level aggregation is effective on base models, while showing limited improvement on aligned models. (§4.3.1) +- 6. Overlong filtering enhances accuracy and clarity for short-to-medium reasoning tasks but provides limited benefits for long-tail reasoning. (§4.4.1) +- 7. Two techniques may unlock learning capacity in critic-free policies based on vanilla PPO loss. (§5) + +# 2 Preliminaries + +## 2.1 Proximal Policy Optimization (PPO) + +Proximal Policy Optimization (PPO) (Schulman et al., 2017) is a widely used actor-critic algorithm grounded in the policy gradient framework. It improves the stability of policy learning by optimizing a clipped surrogate objective that restricts the divergence between the new and old policies during training. The PPO objective is: + +$$\mathcal{J}_{\mathrm{PPO}}(\theta) = \mathbb{E}_{\left[q \sim P(Q),\ o \sim \pi_{\theta_{\mathrm{old}}}(O|q)\right]} \left[ \frac{1}{|o|} \sum_{t=1}^{|o|} \min \left( \frac{\pi_{\theta}(o_t|q, o_{< t})}{\pi_{\theta_{\mathrm{old}}}(o_t|q, o_{< t})} A_t,\ \operatorname{clip}\left(\frac{\pi_{\theta}(o_t|q, o_{< t})}{\pi_{\theta_{\mathrm{old}}}(o_t|q, o_{< t})},\ 1 - \epsilon,\ 1 + \epsilon\right) A_t \right) \right], \tag{1}$$ + +where $\pi_{\theta}$ and $\pi_{\theta_{\text{old}}}$ denote the current and old policy models, respectively. $q$ and $o$ represent the sampled question and output sequence, with $o_t$ as the $t$-th token in $o$. $\epsilon$ is a clipping hyperparameter for stabilizing updates. $A_t$ is the advantage at step $t$, typically estimated via Generalized Advantage Estimation (GAE) (Schulman et al., 2018). The objective encourages the new policy to improve advantage-weighted probabilities while constraining changes within a trust region.
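As a concrete illustration, the per-token clipped objective of Eq. (1) can be sketched in plain Python (a toy sketch over scalar ratios and advantages, not any framework's actual implementation):

```python
def ppo_clip_objective(ratios, advantages, eps=0.2):
    """Average the PPO clipped surrogate (Eq. 1) over one response's tokens."""
    terms = []
    for r, a in zip(ratios, advantages):
        clipped = max(1 - eps, min(r, 1 + eps))  # clip(r, 1 - eps, 1 + eps)
        terms.append(min(r * a, clipped * a))    # pessimistic (lower) bound
    return sum(terms) / len(terms)

# A ratio of 1.5 with positive advantage is clipped at 1 + eps = 1.2,
# so overly large favorable updates are damped; in-range terms pass through.
obj = ppo_clip_objective([1.5, 0.9], [1.0, 1.0])  # (1.2 + 0.9) / 2
```

Taking the minimum of the clipped and unclipped terms makes the objective a pessimistic bound, which is what keeps updates inside the trust region.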
+ +## 2.2 Group Relative Policy Optimization (GRPO) + +Group Relative Policy Optimization (GRPO), proposed in DeepSeekMath (Shao et al., 2024), eliminates the value function (critic) and instead estimates the advantage by normalizing rewards within a group of sampled responses for the same prompt. Specifically, for a prompt $q$ with $G$ responses and associated rewards $\{r_i\}_{i=1}^G$, the group-normalized advantage is given by: + +$$\hat{A}_{i,t} = \frac{r_i - \text{mean}(\{r_i\}_{i=1}^G)}{\text{std}(\{r_i\}_{i=1}^G)} \tag{2}$$ + +The effectiveness of this normalization can be understood from the perspective of reward shaping. By emphasizing the differences among candidate outputs for the same prompt, it preserves the reliability of the gradient signal even in sparse reward settings (Hu et al., 2020). Instead of adding a KL penalty to the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss. The overall surrogate objective is: + +$$\mathcal{J}_{\text{GRPO}}(\theta) = \mathbb{E}_{\left[q \sim P(Q), \left\{o_{i}\right\}_{i=1}^{G} \sim \pi_{\theta_{\text{old}}}(O|q)\right]} \left[ \frac{1}{G} \sum_{i=1}^{G} \frac{1}{|o_{i}|} \sum_{t=1}^{|o_{i}|} \left\{ \min\left(r_{i,t}(\theta) \, \hat{A}_{i,t}, \ \text{clip}\left(r_{i,t}(\theta), \ 1 - \epsilon, \ 1 + \epsilon\right) \, \hat{A}_{i,t}\right) - \beta D_{\text{KL}}\left[\pi_{\theta} \parallel \pi_{\text{ref}}\right] \right\} \right], \tag{3}$$ + +where $r_{i,t}(\theta) = \frac{\pi_{\theta}(o_{i,t}|q,o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t}|q,o_{i,< t})}$, $\epsilon$ and $\beta$ are hyperparameters, and $D_{\text{KL}}$ denotes the KL divergence between the learned policy and a reference policy $\pi_{\text{ref}}$.
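Eq. (2) amounts to a z-score of the rewards within each prompt's group, as in this minimal sketch (whether the population or sample standard deviation is used, and how a zero std is handled, varies across implementations; the small `eps` here is an illustrative choice):

```python
from statistics import mean, pstdev

def grpo_advantages(rewards, eps=1e-6):
    """Group-normalized advantages (Eq. 2) for one prompt's G sampled responses."""
    mu, sigma = mean(rewards), pstdev(rewards)  # population std of the group
    return [(r - mu) / (sigma + eps) for r in rewards]

# Binary rewards for G = 4 responses: group mean 0.5, group std 0.5,
# so the advantages land close to [1.0, -1.0, 1.0, -1.0].
advs = grpo_advantages([1.0, 0.0, 1.0, 0.0])
```

Every token of response $i$ then shares the same $\hat{A}_{i,t}$, so correct responses in a mostly-wrong group receive a strong positive signal even under sparse binary rewards.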
+ +## 2.3 Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO) + +Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO) (Yu et al., 2025) is a recent RL method designed to address the unique challenges in LLM reasoning. + +For each question $q$ with gold answer $a$, DAPO samples a group of $G$ outputs $\{o_i\}_{i=1}^G$ from the old policy, computes their rewards, and maximizes the following surrogate objective: + +$$\mathcal{J}_{\text{DAPO}}(\theta) = \mathbb{E}_{\left[(q,a) \sim \mathcal{D},\ \left\{o_i\right\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot \mid q)\right]} \left[ \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \min\left(r_{i,t}(\theta) \, \hat{A}_{i,t}, \ \operatorname{clip}\left(r_{i,t}(\theta), \ 1 - \epsilon_{\text{low}}, \ 1 + \epsilon_{\text{high}}\right) \, \hat{A}_{i,t}\right) \right], \tag{4}$$ + +where $\hat{A}_{i,t}$ is the group-normalized advantage. In addition, DAPO decouples the upper and lower clipping ranges ($\epsilon_{\text{low}}$, $\epsilon_{\text{high}}$) to better support exploration, dynamically filters out samples where all responses are correct or incorrect, aggregates losses at the token level, and applies special reward shaping for overlong or truncated responses. + +## 2.4 Reinforcement Learning Techniques + +A variety of practical techniques have been introduced to stabilize optimization, reduce variance, and accelerate convergence of LLMs on reasoning tasks. Drawing from prior research and practical implementations, we categorize widely used techniques as follows. + +**Baseline Design.** Baselines are crucial for reducing variance in policy gradient estimation. Recent studies have proposed more effective formulations, such as using the mean reward within each group as the baseline (Shao et al., 2024) and computing the baseline for each sample as the average gradient estimate from the other samples in the group (Ahmadian et al., 2024; Kool et al., 2019).
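The two baseline formulations mentioned above, the group mean and the leave-one-out (RLOO-style) average, can be sketched with scalar rewards (a toy illustration, not the cited papers' code):

```python
def group_mean_baseline(rewards):
    """Baseline = mean reward of the whole group (GRPO-style, without std)."""
    return [sum(rewards) / len(rewards)] * len(rewards)

def leave_one_out_baseline(rewards):
    """Each sample's baseline = mean reward of the *other* G - 1 samples (RLOO-style)."""
    total, g = sum(rewards), len(rewards)
    return [(total - r) / (g - 1) for r in rewards]

rewards = [1.0, 0.0, 1.0, 0.0]
adv_group = [r - b for r, b in zip(rewards, group_mean_baseline(rewards))]
adv_loo = [r - b for r, b in zip(rewards, leave_one_out_baseline(rewards))]
# adv_group = [0.5, -0.5, 0.5, -0.5]; adv_loo rescales these by G / (G - 1)
```

Excluding a sample's own reward from its baseline keeps the baseline independent of that sample's action, which is the motivation for the leave-one-out variant.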
+ +**Clipping Strategies.** Clipping controls excessive updates in policy optimization and can be applied to different quantities, such as rewards, advantages, or ratios. For example, the *Clip Higher* (Yu et al., 2025) method relaxes the upper bound in PPO's ratio clipping to better preserve exploration. + +**Normalization Strategies.** Normalization of rewards or advantages helps stabilize gradient magnitudes. Representative approaches include *Batch-level Reward Normalization* (Hu et al., 2025), *Group-level Reward Normalization* (Shao et al., 2024; Ahmadian et al., 2024), and *Reward Shift without Standard Deviation* (Liu et al., 2025a), which removes the standard deviation term to avoid difficulty bias. + +**Filtering Strategies.** These strategies filter out uninformative or undesirable samples prior to gradient computation. Examples include *Overlong Filtering* (Yu et al., 2025) to remove responses exceeding predefined length limits; *Error Max Clip Mask* and *Right Min Clip Mask* to filter overly incorrect or trivially correct samples; and *Difficulty Mask* (Yu et al., 2025; Zhang et al., 2025; Chu et al., 2025) to exclude samples outside a targeted difficulty range. + +**Loss Aggregation Granularity.** The granularity of loss aggregation determines the relative weight each token contributes to the overall objective. Common approaches include *Token-level Loss*, which weights individual tokens to reduce length bias, and *Sequence-level Loss*, which aggregates at the sequence level. + +**Additional Loss Functions.** Auxiliary losses can complement the primary objective and regularize training. *KL Loss* (Yu et al., 2025; Liu et al., 2025a) constrains divergence from a reference policy, while *SFT Loss* (Zhang and Zuo, 2025) incorporates supervised fine-tuning objectives to preserve alignment. + +**Reward Design.** Shaping the reward function can guide desired output properties.
Common examples include *Length Penalty*, which discourages excessively long outputs; *Formatting Reward*, which encourages outputs that adhere to preferred structures such as boxed answers, bullet lists, or code-style formatting; and *Length-Dependent Accuracy Reward*, which combines correctness with output length. + +The above categories summarize the most prevalent improvement strategies for RL in LLM reasoning. In this work, we focus on four key aspects: *Normalization*, *Clipping*, *Masking*, and *Loss Aggregation*, and conduct in-depth analyses of their mechanisms and practical utility. + +# 3 Experimental Designs + +## 3.1 Experimental Setup + +**Training Algorithm:** We utilize the open-sourced ROLL framework<sup>1</sup> (Wang et al., 2025), an efficient and scalable platform specifically designed for reinforcement learning optimization in LLMs, to conduct all experiments. Besides, we adopt PPO loss (Schulman et al., 2017), with advantage values computed using the REINFORCE algorithm (Sutton et al., 1999), as the unified and naive RL baseline. To keep the global sampling batch size consistent with existing research, i.e., 1024, we set the rollout batch size to 128 and sample 8 responses for each prompt, with a maximum response length of 8192 tokens. The learning rate is set to 1e-6. For text generation, we use a top\_p value of 0.99, a top\_k value of 100, and a temperature of 0.99. + +**Base Models:** To comprehensively evaluate reinforcement learning (RL) techniques across parameter scales, our experiments cover two model sizes: Qwen3-4B and Qwen3-8B. For each model size, we include both non-aligned pre-trained versions (Qwen3-4B-Base and Qwen3-8B-Base) and aligned versions, allowing us to assess RL gains from various starting points<sup>2</sup>. + +<sup>1</sup>Open-source RL framework: https://github.com/alibaba/ROLL
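For reference, the sampling setup above can be collected into a single configuration (a plain-dict sketch; the key names are ours, not ROLL's):

```python
rollout_cfg = {
    "rollout_batch_size": 128,    # prompts sampled per step
    "responses_per_prompt": 8,    # G responses per prompt
    "max_response_length": 8192,  # tokens
    "learning_rate": 1e-6,
    "top_p": 0.99,
    "top_k": 100,
    "temperature": 0.99,
}

# The global sampling batch matches the 1024 used in existing research.
global_batch = rollout_cfg["rollout_batch_size"] * rollout_cfg["responses_per_prompt"]
assert global_batch == 1024
```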
**Training Datasets:** To ensure reproducibility and fairness, we exclusively use open-source datasets for training, including SimpleRL-Zoo-Data (Zeng et al., 2025) and DeepMath (He et al., 2025a). To comprehensively examine how problem difficulty affects each RL technique's performance, we randomly sample from these datasets while removing an excessive proportion of examples whose ground-truth label is simply "True" or "False", because we identify the **ostensible positive phenomenon** wherein models produce correct binary answers from erroneous reasoning chains, introducing noisy supervision that compromises training quality (please refer to Appendix B.2 for case studies). Figure 2 visualizes the difficulty across the training dataset as assessed by GPT-4o (Hurst et al., 2024).

- Easy Data: We randomly sample 5,000 entries from SimpleRL-Zoo-Data-Easy, which consists of problems drawn from GSM8K and MATH-500 (Level 1).
- Medium Data: We select the 5,000 easiest examples from the *DeepMath-103k* dataset, based on their assigned difficulty annotations.
- Hard Data: We randomly sample 5,000 entries from *DeepMath-103k*, with sampling probability proportional to each entry's assigned difficulty level.

**Evaluation Benchmark:** All experiments are conducted on six math benchmarks, i.e., MATH-500 (Hendrycks et al., 2021), OlympiadBench (He et al., 2024), MinervaMath (Lewkowycz et al., 2022), and subsets of standardized examinations (AIME24-25, AMC23). These benchmarks span a broad complexity spectrum from basic arithmetic to competition-level mathematics, enabling a comprehensive evaluation of reasoning capabilities.

## 3.2 Baseline Results

**Impact of Data Difficulty on Training Dynamics.** We investigate how data difficulty influences the training dynamics of Qwen3 models. Specifically, we analyze training convergence patterns through loss dynamics, accuracy trajectories, and generalization gaps, across three tiers of complexity (Easy, Medium, Hard).
The detailed learning curves are shown in Figure 3.

The experimental results demonstrate that, as the number of training epochs increases, the model exhibits markedly different accuracy trajectories across datasets of different difficulty.

![](_page_4_Figure_9.jpeg)

Figure 2: Number of correct responses under 8 rollouts for datasets of different difficulty levels.

Furthermore, when confronted with more challenging samples, the model often fits complex reasoning patterns by generating more tokens.

When focusing on the differences in learning efficiency between the unaligned Base model and the aligned model under the same experimental setting (as shown in Figure 3), the aligned models demonstrated a substantially higher initial accuracy and produced responses with a significantly longer average token length in the early stages of training. However, the performance improvement from additional learning steps of the aligned model was relatively modest, yielding only about a 2% increase in accuracy. This result suggests that the current RL4LLM algorithm offers only a slight improvement for aligned models that are already highly optimized.

# 4 Analysis

## 4.1 Normalization

Advantage normalization is a well-established technique for reducing gradient variance and stabilizing policy optimization (Zheng et al., 2023), and it has become a standard component of RL training pipelines for language models. However, significant differences remain in how normalization is implemented. For example, GRPO (Shao et al., 2024) and RLOO (Ahmadian et al., 2024; Kool et al., 2019) use group-level normalization, calculating advantages relative to other responses within the same prompt to foster intra-context competition.
On the other hand, REINFORCE++ (Hu et al., 2025) employs batch-level normalization, arguing that excessive optimization within a single prompt can lead to reward hacking and hinder generalization, especially when response diversity is low.

²Checkpoint links: https://huggingface.co/Qwen/Qwen3-4B; https://huggingface.co/Qwen/Qwen3-8B; https://huggingface.co/Qwen/Qwen3-4B-Base; https://huggingface.co/Qwen/Qwen3-8B-Base

![](_page_5_Figure_0.jpeg)

Figure 3: **Top 2 rows:** Test accuracy and response length of four model variants, Qwen3-4B-Base, Qwen3-8B-Base, Qwen3-4B, and Qwen3-8B, across different data difficulty. **Middle 2 rows:** Accuracy over training iterations of Base models. The first row presents results of Qwen3-4B-Base; the second row shows results of Qwen3-8B-Base. **Bottom 2 rows:** Accuracy over training iterations of aligned models. The first row presents results of Qwen3-4B, while the second row shows results of Qwen3-8B. To ensure clarity and intuitiveness in the qualitative analysis, all curves are consistently smoothed using identical parameters. Specifically, the mean values are computed using an 11-step moving window with an exponential smoothing factor of 0.8. The shaded regions around the curves represent the range mean $\pm$ (std\_multiplier $\times$ standard deviation), providing a visual representation of the oscillation amplitude.

Formally, given a prompt $x$ with $K$ sampled responses and corresponding rewards $\{r_k\}_{k=1}^K$, the group-level normalized advantage for the $k$-th response is:

$$A_k^\mathrm{group} = \frac{r_k - \mathrm{mean}(\{r_j\}_{j=1}^K)}{\mathrm{std}(\{r_j\}_{j=1}^K)}. \tag{5}$$

In contrast, batch-level normalization computes statistics over a rollout batch of $N$ prompts with $K$ sampled trajectories each.
The normalized advantage for the $i$-th response is:

$$A_i^{\text{batch}} = \frac{r_i - \text{mean}(\{r_j\}_{j=1}^{NK})}{\text{std}(\{r_j\}_{j=1}^{NK})}. \tag{6}$$

#### 4.1.1 Advantage normalization is sensitive to reward mechanisms

#### Takeaway 1

**Group-level normalization** demonstrates robust efficiency across different reward settings. **Batch-level normalization** provides more stable improvement under a larger-scale reward setting.

To systematically evaluate the impact of advantage normalization on PPO variants with a value function using the Monte Carlo return target, we conducted experiments under a unified training framework, exploring three settings: **no normalization**, **batch-level normalization**, and **group-level normalization**. To highlight the differential impacts of the normalization techniques during training, we selected the Qwen3-Base series models due to their low initial scores and high improvement potential (Yang et al., 2025). This choice ensures a fair comparison by minimizing confounding factors from alignment or prior optimization. Focusing on model scale as a key variable, we evaluate small (4B) and medium-sized (8B) models to empirically assess whether normalization techniques interact with model capacity. This approach allows us to derive practical insights into normalization strategies across different computational budgets and architectures.

Under the default setting of the reward mechanism, i.e., $R \in \{0,1\}$³, the performance in Figure 4 shows that both advantage normalization techniques can significantly influence the model's convergence speed, performance stability, and final outcomes. Specifically, on both model sizes, group-level normalization consistently achieves more stable training dynamics and higher final performance compared to both batch-level normalization and no normalization.
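To make the difference between the two schemes concrete, here is a minimal pure-Python sketch of group-level (Eq. 5) and batch-level (Eq. 6) advantage normalization. The small `eps` guarding against a zero standard deviation is our own addition, not part of the paper's formulas:

```python
import math

def _mean(xs):
    return sum(xs) / len(xs)

def _std(xs):
    m = _mean(xs)
    return math.sqrt(sum((x - m) ** 2 for x in xs) / len(xs))

def group_norm_advantages(rewards, eps=1e-8):
    # Eq. (5): normalize each reward against its own prompt group of K responses.
    out = []
    for group in rewards:
        m, s = _mean(group), _std(group)
        out.append([(r - m) / (s + eps) for r in group])
    return out

def batch_norm_advantages(rewards, eps=1e-8):
    # Eq. (6): normalize every reward against the full N*K rollout batch.
    flat = [r for group in rewards for r in group]
    m, s = _mean(flat), _std(flat)
    return [[(r - m) / (s + eps) for r in group] for group in rewards]
```

Note how an all-correct group gets zero advantage under group-level normalization (no intra-group signal), whereas batch-level normalization still rewards it relative to the batch mean.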
Batch-level normalization exhibits high sensitivity to reward distribution skew, often leading to performance collapse under an imbalanced batch, where a few outlier samples dominate the advantage estimates.

However, when we changed the reward mechanism to the larger scale of $R \in \{-1,1\}$⁴, batch-level normalization regained its effectiveness, demonstrating a significant improvement in policy learning, as shown in Figure 5. These experiments clearly demonstrate the sensitivity of the advantage normalization technique to the reward mechanism.

#### 4.1.2 Impact of the standard deviation term in advantage normalization

#### Takeaway 2

**Removing the standard deviation** when reward distributions are highly concentrated (e.g., an easy training dataset) enhances the stability and effectiveness of model training.

The previous section highlighted the sensitivity of various normalization techniques to the reward scale. Thus, a question naturally emerges: *what drives this phenomenon?* A plausible explanation is that different reward scales directly impact the calculation of the standard deviation, thereby altering the strength of the normalization. In particular, when model responses within a prompt group yield highly similar rewards, e.g., when the responses are almost all correct or all incorrect, the resulting standard deviation becomes extremely small. In such cases, dividing by this small standard deviation during normalization

³$R \in \{0,1\}$ represents the default rule-based binary reward mechanism, where a value of 1 is assigned to trajectories that generate correct answers, and a value of 0 is assigned to incorrect ones.

⁴$R \in \{-1,1\}$ further increases the magnitude of reward differences compared to the default mechanism, where a value of 1 is assigned to trajectories that generate correct answers, and a value of -1 is assigned to incorrect ones.
![](_page_7_Figure_0.jpeg)

Figure 4: Accuracy over training iterations of Base models. **Top 2 rows**: Qwen3-4B-Base with different normalization techniques. The first row uses the easy training dataset, while the second row uses the hard training dataset. **Middle 2 rows**: Qwen3-8B-Base with different normalization techniques (under the default reward scale). **Bottom 2 rows**: Accuracy over training iterations of aligned models (trained on the medium-level dataset, under the default reward scale) with different normalization techniques. The first row shows the results of Qwen3-4B, while the second row shows the results of Qwen3-8B.

can excessively amplify gradient updates, causing the model to overemphasize tasks of extreme difficulty, a phenomenon similar to "difficulty bias" (Liu et al., 2025a).

To determine whether the calculation of the standard deviation is the key module causing the difference in normalization performance, we employ the batch-level calculation, which exhibited unstable performance in the previous section, to compute the mean of the advantage, and conduct ablation experiments on the standard deviation term. This can be formalized as:

$$A_k^{\text{std}^-} = r_k - \mathrm{mean}(\{r_j\}_{j=1}^K). \tag{7}$$

![](_page_8_Figure_0.jpeg)

Figure 5: **Top 2 rows**: Accuracy over training iterations of Qwen3-4B-Base with batch-level normalization under different reward scales. The first row uses the easy training dataset, while the second row uses the medium training dataset. **Bottom 2 rows**: Accuracy over training iterations of Qwen3-4B-Base with group-level normalization under different reward scales.

We separately recorded the accuracy after training on simple and difficult data. The curves for easy data in Figure 6 show that the policy rapidly converges to highly consistent behaviors, leading to a highly concentrated distribution of reward values.
Correspondingly, the standard deviation of the reward distribution swiftly declines to a low value. Applying standard-deviation-based normalization in this setting results in an exceedingly small denominator, which excessively amplifies reward and advantage values. This, in turn, induces abnormally large gradients, destabilizes training, and can even trigger gradient explosions. These experimental results therefore empirically verify our conjecture that the standard deviation term is the key mechanism behind advantage normalization.

To further solidify our conclusion, we add a set of comparisons based on the hard dataset. We observe that the standard deviation of rewards remains comparatively high during training. As a result, both mean-only normalization and standard-deviation-based normalization yield similar efficiency, and training remains stable regardless of the normalization style. Consequently, the choice of normalization style has little impact on convergence or overall performance under such a smooth reward distribution.

In summary, our experiments and analysis underscore that, in scenarios where reward distributions are highly concentrated, omitting the standard deviation from advantage normalization effectively prevents abnormal gradient amplification, thereby improving the stability and robustness of model training. However, for tasks characterized by inherently higher reward variance, either normalization approach is generally sufficient to maintain stable optimization.

![](_page_9_Figure_0.jpeg)

Figure 6: **Left:** Standard deviation variations during training on datasets of different difficulty levels. **Right:** Test accuracy before and after removing the standard deviation from batch-level normalization, with results for training on Easy Data (top) and Hard Data (bottom).

![](_page_9_Figure_2.jpeg)

Figure 7: Accuracy comparison of Base models with different standard deviation calculations.
**Top 2 rows**: Accuracy of Qwen3-4B-Base with different standard deviation calculations. The first row uses the easy training dataset, while the second row uses the hard training dataset. **Bottom 2 rows**: Accuracy comparison of Qwen3-8B-Base with different standard deviation calculations. The first row uses the easy training dataset, while the second row uses the hard training dataset.

#### 4.1.3 Reconstructing a robust normalization technique

#### Takeaway 3

Calculating the mean at the local (group) level and the standard deviation at the global (batch) level enables more robust reward shaping.

Section 4.1.2 highlights the critical role of the standard deviation in determining the effectiveness of the advantage normalization mechanism. This raises a final question: is there a more robust and effective combination of mean and standard deviation for reward shaping? To explore this, we adopted the stable group-level mean calculation demonstrated in Section 4.1.1, paired with two approaches for computing the standard deviation: local (group-level) and global (batch-level). We then evaluated the performance of these combinations across two model sizes.

The results, presented in Figure 7, reveal that the global-level calculation exhibits a clear advantage. We attribute this to the batch-level standard deviation providing stronger normalization by effectively reducing gradient magnitudes, thereby preventing excessive policy updates. This approach aligns more effectively with the biased reward signals common under sparse rewards and coarse-grained advantage fitting, resulting in more stable and robust learning behavior. Furthermore, our experimental results support the claim from Hu et al. (2025) that batch-level normalization, or even subtracting the local mean and dividing by the batch standard deviation in certain scenarios, performs better.
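Takeaway 3's mixed scheme (local mean, global standard deviation) can be sketched as follows. This is our own minimal illustration, with an `eps` term added to guard against zero variance:

```python
import statistics

def mixed_norm_advantages(rewards_per_prompt, eps=1e-8):
    """Takeaway 3 sketch: subtract the group-level (local) mean,
    divide by the batch-level (global) standard deviation."""
    # Global std over all N*K rewards in the rollout batch.
    flat = [r for group in rewards_per_prompt for r in group]
    batch_std = statistics.pstdev(flat) + eps
    # Local mean per prompt group.
    out = []
    for group in rewards_per_prompt:
        group_mean = statistics.fmean(group)
        out.append([(r - group_mean) / batch_std for r in group])
    return out
```

Because the denominator is shared across the whole batch, a single homogeneous group no longer produces a near-zero divisor, which is exactly the failure mode analyzed in Section 4.1.2.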
## 4.2 Clip-Higher

While the Clip mechanism enhances PPO training stability (Huang et al., 2024b), it introduces critical challenges in LLM-based text generation. Specifically, it disproportionately suppresses low-probability tokens (Yu et al., 2025), leading to entropy collapse, i.e., a state where strategies become deterministic and lack diversity (Jin et al., 2024). This suppression creates a harmful positive feedback loop: as training progresses, entropy decreases, exploration shrinks, high-probability patterns are further reinforced, and entropy declines even more. Such behavior severely hinders performance on complex reasoning tasks, where novel path exploration is essential. To address this, the Clip-Higher mechanism is widely introduced into the training objective, whose clipping term can be formalized as:

$$\operatorname{clip}\left(r_{i,t}(\theta),\, 1 - \varepsilon_{\text{low}},\, 1 + \varepsilon_{\text{high}}\right), \tag{8}$$

where $\varepsilon_{\text{high}}$ denotes the upper bound of the Clip mechanism and $\varepsilon_{\text{low}}$ represents the lower bound. Unlike the original clip, which enforces symmetric bounds, Clip-Higher introduces a higher upper bound for the importance ratio, giving low-probability tokens more room to improve. By expanding exploration potential in low-probability regions, this technique effectively mitigates entropy collapse. However, the lack of in-depth analysis of the underlying mechanism and the absence of detailed usage guidelines have left practitioners confused about the appropriate scenarios for using Clip-Higher, as well as the ideal upper-bound settings under different conditions. In this section, we address these remaining issues through a series of comprehensive experiments.

![](_page_10_Figure_9.jpeg)

Figure 8: Entropy comparison across different models with Clip-Higher. A higher clip upper bound can mitigate the entropy drop in aligned models.
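A minimal sketch of the asymmetric clipped surrogate for a single token may help. `clip_higher_term` is a hypothetical helper name, and the default bounds 0.2/0.28 follow the values discussed in this section:

```python
def clip_higher_term(ratio, advantage, eps_low=0.2, eps_high=0.28):
    """PPO-style surrogate for one token with asymmetric (Clip-Higher)
    bounds: the ratio is clipped to [1 - eps_low, 1 + eps_high], so a
    larger eps_high leaves low-probability tokens more room to grow."""
    clipped = min(max(ratio, 1.0 - eps_low), 1.0 + eps_high)
    # Pessimistic min over unclipped and clipped surrogates, as in PPO.
    return min(ratio * advantage, clipped * advantage)
```

With a positive advantage and a ratio of 1.5, the symmetric bound (0.2) caps the surrogate at 1.2, while Clip-Higher (0.28) allows 1.28, i.e., a larger permitted update for the favored token.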
![](_page_11_Figure_0.jpeg)

Figure 9: **Top 2 rows**: Test accuracy of Base models (trained on medium data) with a higher clipping upper bound. **Middle 2 rows**: Test accuracy of aligned models (trained on medium data) with a higher clipping upper bound. **Bottom 2 rows**: Test accuracy of aligned models (trained on easy data) with a higher clipping upper bound.

#### 4.2.1 In which settings should we clip higher

#### Takeaway 4

For models with stronger fundamental reasoning abilities, increasing the clipping upper bound is more likely to facilitate exploration of better solution paths.

Through extensive empirical practice, we observe that the advantage clip technique demonstrates distinct effectiveness across different model architectures. To examine this, this section employs non-aligned (base) and aligned (instruct) models of various sizes to clearly demonstrate the sensitivity of the Clip mechanism and to summarize usage guidelines for Clip-Higher from a modeling perspective.

As illustrated in Figure 8, experimental results indicate that the impact of increasing the upper clipping bound $\varepsilon_{high}$ is model-dependent. For the base models, adjusting the upper clipping value yields minor effects on policy entropy and even damages performance compared to the vanilla policy (as shown in the top 2 rows of Figure 9). In contrast, aligned models exhibit a markedly different response: raising the upper clipping bound notably slows the entropy collapse, leading to consistent performance improvements in downstream evaluation metrics (refer to the middle and bottom rows in Figure 9).

This disparity can be attributed to several underlying factors. First, the base models operate with a low policy clipping rate, approximately 0.003, which indicates only minimal deviation between successive policies.
Moreover, the limited expressiveness of the policy constrains these base models' capacity for exploration, hindering the discovery of high-reward trajectories. Consequently, a higher clipping upper bound yields negligible improvements in learning dynamics.

On the other hand, aligned models that leverage advanced pre-training techniques or post-training enhancements demonstrate superior reasoning capabilities and generalization performance (Yang et al., 2025). As shown in Figure 10, compared to the base model, the aligned model has very few preferred tokens with high probability in the initial stage. Token distributions for larger-scale models are provided in Appendix D. Therefore, a higher clipping upper bound can effectively bridge the probability gap between tokens and alleviate the entropy collapse. For these models, raising the upper bound expands the permissible range of policy updates, which in turn facilitates more diverse action sampling and enhances exploratory behavior during training. This mechanism preserves higher entropy while simultaneously increasing the probability of identifying optimal solutions, as evidenced by improved evaluation metrics.

![](_page_12_Figure_3.jpeg)

Figure 10: Predicted probability distributions of Qwen3-4B-Base (left) and Qwen3-4B (right) under two clipping upper bounds $\in \{0.20, 0.28\}$.

#### 4.2.2 Analyzing the effectiveness of Clip-Higher from a linguistic perspective

#### Takeaway 5

**Traditional clipping** may restrict the model's capacity to generate innovative reasoning structures. **Clipping higher** allows the model to explore a broader range of discourse reasoning structures.

Building on our token-level demonstration of Clip-Higher's behavior in Section 4.2.1, we now analyze its impact on reasoning logic from a token-level linguistic perspective.
As illustrated in Figure 11, setting the upper bound to 0.2 imposes stringent constraints on policy updates by limiting substantial probability deviations for individual tokens. Under these stricter conditions, our analysis reveals that clipping predominantly affects connective tokens such as "therefore", "if", and "but". These tokens frequently appear at the beginnings of sentences, serving as key semantic markers or transition words within dialog generation. Such connectors often introduce new directions in reasoning. However, their probability ratios between the updated and old policies frequently exceed clipping thresholds, triggering aggressive suppression in PPO optimization. While this traditional clipping ensures stability in the overall token distribution, it may restrict the model's capacity to generate innovative or diverse argumentative reasoning structures by constraining flexibility in the use of discourse-level connectives.

Furthermore, raising the upper bound from 0.2 to 0.28 significantly expands the policy update space, permitting greater deviations in token-level probabilities from the old policy. Under these more permissive conditions, our analysis indicates that the frequency of clipped tokens decreases markedly,

![](_page_13_Figure_0.jpeg)

Figure 11: **Left:** A case study under the same prompt across various clipping upper bounds. **Right:** The trigger differences of various upper bounds at the top 20 tokens with the highest clip frequencies.

with the focus of clipping shifting away from discourse connectives toward high-frequency functional tokens such as "is", "the", and ",". These tokens are prevalent within sentences and exhibit relatively weak contextual dependencies, making their probability estimates highly sensitive to fluctuations in the probability difference between the sampling and training policies.
This transition allows the model to explore a broader range of discourse reasoning structures and promotes diversity in response generation. Besides, the remaining clipping of common function words serves to maintain the stability of the core sentence structure.

#### 4.2.3 How to set the upper bound for advantage clipping

#### Takeaway 6

There appears to be a "scaling law" between performance and the upper bound of clipping on the **small-sized model**, which does not exist on **larger models**.

Section 4.2.1 verifies that Clip-Higher shows significant improvements on aligned models. However, most current works directly set the upper bound of the clip to the default value of 0.28 from Yu et al. (2025). We believe, however, that different models have different preferences for this parameter. To verify this conjecture, we empirically searched for the hyperparameter settings applicable to different aligned models by uniformly varying the upper bound of the clip. Specifically, we set the exploration range of the clip upper bound from the default threshold of 0.2 used in traditional clipping to 0.32 (beyond the widely used upper bound of 0.28). We employed two sizes of models and uniformly evaluated their learning capabilities under the different settings.

The results in Figure 12 show that for the small-sized model (4B), performance gradually improves as the upper bound of the clip increases. At 0.32, it demonstrates the best performance compared to the other settings. On the other hand, for the larger model size (8B), gradually increasing the upper bound of the clip does not yield progressive improvement. The performance is most prominent when the upper bound is set to 0.28.

## 4.3 Loss Aggregation

The strategy of loss aggregation directly determines the contribution of each sample or token to the overall gradient during optimization (Liu et al., 2025b). Common strategies include token-level and sequence-level aggregation.
The sequence-level aggregation adopted by GRPO (Shao et al., 2024) first averages the loss across all tokens within each sample, then averages these per-response losses across the batch, thereby assigning equal weight to each response regardless of its length. However, Yu et al. (2025) highlight a flaw in this method: longer responses possess a diminished per-token influence on the total loss, hindering the model's ability to learn effectively from reasoning of diverse quality in lengthier responses. This can reduce the model's capacity to learn from long, complex answers, and may bias optimization toward brevity, since shorter correct responses receive larger gradient updates, while longer incorrect responses are insufficiently penalized (Liu et al., 2025a).

![](_page_14_Figure_0.jpeg)

Figure 12: Test accuracy of aligned models (trained on medium data) with various clipping upper bounds.

$$\begin{split} \mathcal{J}_{\text{sequence-level}}(\theta) &= \mathbb{E}_{(q,a) \sim \mathcal{D}, \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot|q)} \\ &\left[ \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \min \left( r_{i,t}(\theta) \hat{A}_{i,t}, \operatorname{clip}\left( r_{i,t}(\theta), 1 - \epsilon_{\text{low}}, 1 + \epsilon_{\text{high}} \right) \hat{A}_{i,t} \right) \right] \end{split}$$

$$\begin{split} \mathcal{J}_{\text{token-level}}(\theta) &= \mathbb{E}_{(q,a) \sim \mathcal{D}, \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(\cdot\mid q)} \\ &\quad\left[ \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \min\left( r_{i,t}(\theta) \hat{A}_{i,t}, \operatorname{clip}\left(r_{i,t}(\theta), 1 - \epsilon_{\text{low}}, 1 + \epsilon_{\text{high}}\right) \hat{A}_{i,t} \right) \right] \end{split}$$

In response to this issue, Yu et al. (2025) turn to a token-level calculation approach.
Here, losses are calculated by summing the loss across all tokens from all samples and then normalizing by the total token count, guaranteeing an equal contribution from each token regardless of response length. Despite the widespread adoption of these methods, existing analyses remain shallow. In this section, we provide a detailed empirical comparison of the two loss calculation techniques across diverse training data distributions. The evaluation comprehensively assesses the effectiveness of these methods from the perspective of model type.

#### 4.3.1 Does token-level loss aggregation suit all settings?

#### Takeaway 7

Compared to sequence-level calculation, token-level loss proves to be more effective on Base models, while showing limited improvement on Instruct models.

To systematically evaluate the effectiveness of different loss aggregation strategies, we compare token-level and sequence-level loss aggregation on both base and aligned versions of Qwen3-8B, as shown in Figures 13 and 18. For base models, token-level loss consistently improves convergence, peak accuracy, and robustness by ensuring each token contributes equally to the optimization signal, especially on challenging datasets. However, as illustrated in Figure 13 (bottom 2 rows), this advantage does not hold for aligned models. In fact, sequence-level aggregation outperforms token-level loss across most datasets and settings, both in convergence speed and final accuracy. Further analysis reveals that aligned models already possess strong and stable reasoning, making the equalization of token-level gradients unnecessary or even detrimental. In these cases, sequence-level aggregation better preserves the structure and consistency of high-quality, aligned outputs.

![](_page_15_Figure_0.jpeg)

Figure 13: **Top 2 rows**: Accuracy comparison between sequence-level loss and token-level loss. Qwen3-8B-Base is used as the initial policy. Results are reported on both Easy and Hard Datasets.
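The two aggregation rules can be sketched over a batch of per-token loss lists. This is our own minimal illustration, not the ROLL implementation:

```python
def sequence_level_loss(per_token_losses):
    """GRPO-style: average within each response, then across responses.
    Every response counts equally, regardless of its length."""
    per_seq = [sum(toks) / len(toks) for toks in per_token_losses]
    return sum(per_seq) / len(per_seq)

def token_level_loss(per_token_losses):
    """DAPO-style: sum over all tokens, divide by the total token count.
    Every token counts equally, so longer responses weigh more."""
    total_tokens = sum(len(toks) for toks in per_token_losses)
    return sum(sum(toks) for toks in per_token_losses) / total_tokens
```

For a 4-token response with per-token loss 1.0 and a 1-token response with loss 3.0, sequence-level aggregation gives (1.0 + 3.0) / 2 = 2.0, while token-level aggregation gives 7.0 / 5 = 1.4, illustrating how the short response dominates under the former.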
**Bottom 2 rows**: Test accuracy of Qwen3-8B with different loss aggregations.

These findings highlight that the optimal loss aggregation strategy is model-dependent: token-level aggregation is best suited for base models, while sequence-level aggregation is preferable for instruction-tuned models.

## 4.4 Overlong Filtering

During the training of LLMs, a fixed maximum generation length is often set for truncation to ensure training efficiency and save computational costs (Chen et al., 2025; Team et al., 2025). However, recent studies have revealed that in more complex reasoning tasks, this strategy can prematurely end multi-step reasoning processes, an effect particularly noticeable in the early training stages. Consequently, coherent and well-structured reasoning is often cut short before reaching the final answer, causing such responses to be falsely labeled as negative samples. This penalty-like noise can contaminate the training signal, reducing sample utilization efficiency and learning effectiveness.

To address this issue, the technique named *overlong filtering* has been introduced (Yu et al., 2025). This method masks the reward signal of excessively long responses to preserve training loss robustness and prevent degradation of reasoning behavior (He et al., 2025b). Despite its benefits, there remains a lack of detailed analysis regarding the sensitivity of this technique to the mask threshold, leading to confusion among practitioners.

This section analyzes the impact of overlong filtering on performance across diverse datasets under varying maximum generation length settings. By doing so, we seek to identify the suitable scenarios for applying this technique.

![](_page_16_Figure_0.jpeg)

Figure 14: **Top 2 rows**: Total test accuracy and response length of Qwen3-8B-Base over training iterations under different maximum generation lengths.
**Middle 3 rows**: Test accuracy of Qwen3-8B-Base over training iterations under different maximum lengths. We set maximum lengths of 8*k*, 16*k*, and 20*k*. **Bottom rows**: Validation of overlong mask effectiveness on Qwen3-8B.

#### 4.4.1 When to use the overlong filtering

#### Takeaway 8

Overlong filtering shows limited effectiveness on long-tail reasoning tasks; however, it can enhance the accuracy and clarity of responses in medium- and short-length reasoning tasks.

Although recent works have verified the benefits of overlong filtering for policy training (Team et al., 2025; Chen et al., 2025), the impact of different maximum lengths on this technique is still unclear. Therefore, we employ the widely used Qwen3-8B-Base and Qwen3-8B as the unified initial policies to compare the effects of different maximum generation lengths on the training dynamics.

The results in Figure 14 highlight how different filter thresholds affect learning dynamics. Notably, when the filter threshold is restricted to 8k tokens, substantial benefits are evident from implementing the overlong filtering. However, with a longer filter threshold, i.e., 20k tokens, the benefits derived from this technique diminish significantly. After checking the response lengths, a discernible pattern emerges to explain this phenomenon. When operating under the threshold of 20k, models trained with the overlong filtering strategy exhibit a tendency to generate longer responses in comparison to the vanilla policy. Conversely, a short filter threshold, i.e., 8k, makes the model generate shorter responses.
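A minimal sketch of the masking step, under the assumption that filtering is implemented as a zero loss weight for truncated responses:

```python
def overlong_mask(lengths, max_len):
    """Overlong-filtering sketch: responses that hit the generation limit
    get weight 0.0 (ignored), all others get 1.0. The mask is multiplied
    into the per-sample loss so truncated generations are skipped rather
    than punished as incorrect answers."""
    return [0.0 if n >= max_len else 1.0 for n in lengths]
```

Multiplying this mask into the per-sample losses removes the penalty-like noise from truncated-but-possibly-correct reasoning chains described above.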
The statistical form of the repetition rate can be found in Appendix B.1. + +To further investigate this effect, Figure 15 (Left) shows the distribution of filtered responses exceeding the maximum length. Notably, in the 20k setting, both correct and incorrect samples are filtered more frequently due to repetitive or non-terminating outputs, a hallmark of degenerate generation. This indicates that, with higher length limits, the overlong filtering strategy primarily filters out unproductive or "negative" samples that contribute little to model learning. As illustrated in Figure 15 (Right), we observe that during RL training of instruction-tuned models, the proportion of "repetitive but unable to terminate normally" samples within the overall set of overlong samples gradually increases as training progresses. This indicates a degradation in the model's ability to accurately model end-of-sequence (EOS) tokens, leading to behavioral defects at inference time, such as output redundancy and difficulty terminating generation. After introducing the overlong filtering mechanism, the proportion of such "repetitive but unable to terminate" samples decreases significantly during training. This shift suggests that the model can more accurately distinguish between "completed generation" and "truncated generation" samples during training, effectively avoiding invalid learning from truncated samples. Furthermore, this mechanism may restore the policy's ability to accurately model termination behavior during generation, enabling it to appropriately ignore unfinished inference samples rather than mistakenly penalizing them as negative examples.
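In implementation terms, overlong filtering amounts to removing the learning signal of any rollout that hit the generation limit, so that truncated-but-possibly-correct reasoning is not punished as a negative sample. A minimal sketch of this idea (not the authors' implementation; the data layout and names are illustrative):

```python
def apply_overlong_filter(advantages, truncated):
    """Mask the learning signal of truncated (overlong) rollouts.

    advantages: per-token advantages, one list per rollout
    truncated:  one bool per rollout, True if it hit the length limit

    Truncated rollouts get zero advantage everywhere, so they are
    neither rewarded nor penalized, instead of being mistakenly
    treated as negative samples.
    """
    return [
        [0.0] * len(adv) if cut else list(adv)
        for adv, cut in zip(advantages, truncated)
    ]

# One finished rollout, one cut off at the maximum generation length:
masked = apply_overlong_filter(
    advantages=[[0.5, 0.5], [-1.0, -1.0, -1.0]],
    truncated=[False, True],
)
# masked == [[0.5, 0.5], [0.0, 0.0, 0.0]]
```

The second rollout contributes no gradient at all, matching the described behavior of masking rather than penalizing truncated samples.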
+ +# 5 A simple combination: Lite PPO + +Building on the in-depth mechanism analysis and empirical evaluations presented in the previous sections, we derive two key technique guidelines for non-aligned models: (i) For small and medium-sized non-aligned models, i.e., 4B-Base and 8B-Base, the technique that provides significant performance improvement is the advantage normalization introduced in Section 4.1.3, which shapes sparse rewards into more robust guiding signals through group-level mean calculation and batch-level standard deviation calculation. (ii) Token-level loss aggregation emerges as another highly effective technique for non-aligned models, with the experiments in Section 4.3.1 demonstrating its particular efficacy for base model architectures. + +We therefore propose the following empirically motivated hypothesis: given the individually superior performance of advantage normalization (group-level mean, batch-level std) and token-level loss aggregation over alternative techniques, their combination should yield robust improvements in policy optimization. To validate this, we combine both techniques into a recipe we call Lite PPO and apply it to non-aligned models trained with the vanilla PPO loss without a critic. The results in Figure 16 show that Lite PPO outperforms both DAPO, a technique-heavy algorithm involving *Group-level Normalization*, *Clip-Higher*, *Overlong Reward Shaping*, *Token-level Loss*, and *Dynamic Sampling*, and the strong, widely used RL4LLM algorithm GRPO. + +Specifically, in the first two rows of Figure 16, Lite PPO exhibits a stable upward trend on small models + +![](_page_18_Figure_0.jpeg) + +Figure 16: Test accuracy of non-aligned models trained via three RL methods, i.e., Lite PPO (ours), GRPO (Shao et al., 2024) and DAPO (Yu et al., 2025). + +lacking basic reasoning ability. In contrast, other policies collapse rapidly after reaching their peak.
This significant advantage results from the normalization technique introduced in Takeaway 3, which effectively counters the interference induced by the homogeneous reward distributions characteristic of datasets at the difficulty extremes (easy and hard). We further evaluate Lite PPO on larger base models. As shown in Figure 16, when training 8B-Base models with inherent long-tail generation capabilities on the hard dataset, Lite PPO also demonstrates superior performance. This improvement stems from Lite PPO eliminating overlong filtering (which typically restricts small models' ability to generate complex long-tail outputs; Takeaway 8) and shifting to token-level loss aggregation (which shows better efficiency on base models; Takeaway 7). + +# 6 Conclusion + +The rapid advancement of reinforcement learning (RL) in enhancing large language models (LLMs) has ushered in a transformative era for complex reasoning tasks. However, the proliferation of RL4LLM research has also introduced significant challenges, including conflicting methodologies and a lack of cohesive guidelines for technique selection. This work addresses these critical issues by conducting a systematic, reproducible evaluation of prominent RL techniques under a unified framework, revealing key insights that resolve existing ambiguities and streamline practical implementation. + +By disentangling the theoretical and practical mechanisms of techniques like normalization, clipping, and filtering, our study provides actionable guidelines that demystify their applicability across diverse scenarios. Crucially, we show that simplicity can outperform complexity: a minimalist approach (i.e., Lite PPO) combining only two core techniques achieves superior performance over algorithms cluttered with redundant components. This finding challenges the prevailing trend of over-engineering RL pipelines and underscores the importance of contextual adaptability in technique selection.
Our work not only resolves the current fragmentation in RL4LLM practice but also lays a foundation for developing standardized frameworks that balance theoretical rigor with engineering efficiency. + +Finally, to ensure experimental fairness, this paper consistently uses the Qwen3 series models for policy initialization. However, conclusions may vary across LLM families due to inherent differences in pre-training processes and architectures. The prevailing trend of closed-sourcing models, often driven by commercial or strategic considerations, significantly impedes model-family-level technical analysis. We therefore advocate for increased disclosure of implementation details in future industry technical reports. Such transparency is crucial to bridge the understanding gap between academia and industry, enabling the community to pool its collective insights in artificial intelligence. + +## 7 Future work + +We envision this work as the starting point of a sustained effort to guide the evolution of reinforcement learning for LLMs along principled and empirically grounded trajectories. Our future research will focus on: (1) continuing to monitor and critically evaluate developments in RL4LLM, distilling emerging practices into coherent, evidence-based guidelines for both academic and industrial practitioners; (2) leveraging the proposed ROLL framework to consolidate diverse RL algorithms and optimization strategies into a unified, modular suite, enabling flexible composition and benchmarking within a consistent training infrastructure; (3) continuing to explore streamlined RL algorithms that deliver strong empirical performance with minimal engineering overhead. These directions align with our long-term vision to provide the community with clear and reliable guidance, driving the field toward robust, adaptable, and broadly beneficial progress, while advancing RL4LLM through both algorithmic innovations and comprehensive framework support.
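For concreteness, the two components combined in Lite PPO (Section 5) can be sketched as follows. This is a simplified illustration under assumed data layouts, not the ROLL-based implementation used in our experiments:

```python
from statistics import mean, pstdev

def lite_ppo_advantages(rewards, group_size):
    """Advantage normalization as described in Section 4.1.3:
    subtract the group-level mean, divide by the batch-level std.

    rewards: flat list of scalar rewards, stored as consecutive
    blocks of `group_size` responses sampled from the same prompt.
    """
    batch_std = pstdev(rewards) or 1.0  # guard against zero std
    advantages = []
    for g in range(0, len(rewards), group_size):
        group = rewards[g:g + group_size]
        mu = mean(group)  # group-level mean
        advantages.extend((r - mu) / batch_std for r in group)
    return advantages

def token_level_loss(per_token_losses):
    """Token-level aggregation: average over all tokens in the batch,
    so longer responses contribute proportionally more tokens."""
    flat = [t for resp in per_token_losses for t in resp]
    return sum(flat) / len(flat)

# Two groups of two responses each, binary rewards:
advs = lite_ppo_advantages([1.0, 0.0, 1.0, 0.0], group_size=2)
# advs == [1.0, -1.0, 1.0, -1.0]
```

Using the batch-level standard deviation keeps the scale of the advantages stable even when a single group has homogeneous rewards, which is the failure mode of group-level std discussed in Section 4.1.2.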
+ +# References + +- Siwei Wu, Zhongyuan Peng, Xinrun Du, Tuney Zheng, Minghao Liu, Jialong Wu, Jiachen Ma, Yizhi Li, Jian Yang, Wangchunshu Zhou, Qunshu Lin, Junbo Zhao, Zhaoxiang Zhang, Wenhao Huang, Ge Zhang, Chenghua Lin, and Jiaheng Liu. A comparative study on reasoning patterns of openai's o1 model. *CoRR*, abs/2410.13639, 2024. doi: 10.48550/ARXIV.2410.13639. URL https://doi.org/10.48550/arXiv.2410.13639. +- Zhihong Shao, Peiyi Wang, Qihao Zhu, Runxin Xu, Junxiao Song, Xiao Bi, Haowei Zhang, Mingchuan Zhang, Y. K. Li, Y. Wu, and Daya Guo. Deepseekmath: Pushing the limits of mathematical reasoning in open language models. *arXiv preprint arXiv*: 2402.03300, 2024. URL https://arxiv.org/abs/2402.03300v3. +- Zhiwei He, Tian Liang, Jiahao Xu, Qiuzhi Liu, Xingyu Chen, Yue Wang, Linfeng Song, Dian Yu, Zhenwen Liang, Wenxuan Wang, Zhuosheng Zhang, Rui Wang, Zhaopeng Tu, Haitao Mi, and Dong Yu. Deepmath-103k: A large-scale, challenging, decontaminated, and verifiable mathematical dataset for advancing reasoning. *CoRR*, abs/2504.11456, 2025a. doi: 10.48550/ARXIV.2504.11456. URL https://doi.org/10.48550/arXiv.2504.11456. +- Terry Yue Zhuo, Minh Chien Vu, Jenny Chim, Han Hu, Wenhao Yu, Ratnadira Widyasari, Imam Nur Bani Yusuf, Haolan Zhan, Junda He, Indraneil Paul, Simon Brunner, Chen Gong, James Hoang, Armel Randy Zebaze, Xiaoheng Hong, Wen-Ding Li, Jean Kaddour, Ming Xu, Zhihan Zhang, Prateek Yadav, and et al. Bigcodebench: Benchmarking code generation with diverse function calls and complex instructions. In *The Thirteenth International Conference on Learning Representations, ICLR* 2025, *Singapore, April* 24-28, 2025. OpenReview.net, 2025. URL https://openreview.net/forum?id=YrycTjllL0. +- Shengyi Huang, Michael Noukhovitch, Arian Hosseini, Kashif Rasul, Weixun Wang, and Lewis Tunstall. The N+ implementation details of RLHF with PPO: A case study on tl;dr summarization. *CoRR*, abs/2403.17031, 2024a. doi: 10.48550/ARXIV.2403.17031. 
URL https://doi.org/10.48550/arXiv.2403.17031. +- Jian Hu, Jason Klein Liu, Haotian Xu, and Wei Shen. Reinforce++: An efficient rlhf algorithm with robustness to both prompt and reward models, 2025. URL https://arxiv.org/abs/2501.03262. +- Zichen Liu, Changyu Chen, Wenjun Li, Penghui Qi, Tianyu Pang, Chao Du, Wee Sun Lee, and Min Lin. Understanding r1-zero-like training: A critical perspective. *CoRR*, abs/2503.20783, 2025a. doi: 10.48550/ARXIV.2503.20783. URL https://doi.org/10.48550/arXiv.2503.20783. +- Qiying Yu, Zheng Zhang, Ruofei Zhu, Yufeng Yuan, Xiaochen Zuo, Yu Yue, Tiantian Fan, Gaohong Liu, Lingjun Liu, Xin Liu, Haibin Lin, Zhiqi Lin, Bole Ma, Guangming Sheng, Yuxuan Tong, Chi Zhang, Mofan Zhang, Wang Zhang, Hang Zhu, Jinhua Zhu, Jiaze Chen, Jiangjie Chen, Chengyi Wang, Hongli Yu, Weinan Dai, Yuxuan Song, Xiangpeng Wei, Hao Zhou, Jingjing Liu, Wei-Ying Ma, Ya-Qin Zhang, Lin Yan, Mu Qiao, Yonghui Wu, and Mingxuan Wang. DAPO: an open-source LLM reinforcement learning system at scale. *CoRR*, abs/2503.14476, 2025. doi: 10.48550/ARXIV.2503.14476. URL https://doi.org/10.48550/arXiv.2503.14476. +- Marcin Andrychowicz, Anton Raichuk, Piotr Stanczyk, Manu Orsini, Sertan Girgin, Raphaël Marinier, Léonard Hussenot, Matthieu Geist, Olivier Pietquin, Marcin Michalski, Sylvain Gelly, and Olivier Bachem. What matters in on-policy reinforcement learning? A large-scale empirical study. *CoRR*, abs/2006.05990, 2020. URL https://arxiv.org/abs/2006.05990. +- Logan Engstrom, Andrew Ilyas, Shibani Santurkar, Dimitris Tsipras, Firdaus Janoos, Larry Rudolph, and Aleksander Madry. Implementation matters in deep policy gradients: A case study on PPO and TRPO. *CoRR*, abs/2005.12729, 2020. URL https://arxiv.org/abs/2005.12729. +- John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. *CoRR*, abs/1707.06347, 2017. URL http://arxiv.org/abs/1707.06347.
+- John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. High-dimensional continuous control using generalized advantage estimation, 2018. URL https://arxiv.org/abs/1506.02438. +- Yujing Hu, Weixun Wang, Hangtian Jia, Yixiang Wang, Yingfeng Chen, Jianye Hao, Feng Wu, and Changjie Fan. Learning to utilize shaping rewards: A new approach of reward shaping. *Advances in Neural Information Processing Systems*, 33:15931–15941, 2020. +- Arash Ahmadian, Chris Cremer, Matthias Gallé, Marzieh Fadaee, Julia Kreutzer, Olivier Pietquin, Ahmet Üstün, and Sara Hooker. Back to basics: Revisiting reinforce-style optimization for learning from human feedback in llms. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, *Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), ACL 2024, Bangkok, Thailand, August 11-16, 2024*, pages 12248–12267. Association for Computational Linguistics, 2024. doi: 10.18653/V1/2024.ACL-LONG.662. URL https://doi.org/10.18653/v1/2024.acl-long.662. +- Wouter Kool, Herke van Hoof, and Max Welling. Buy 4 REINFORCE samples, get a baseline for free!, 2019. URL https://openreview.net/forum?id=r11gTGL5DE. +- Xiaojiang Zhang, Jinghui Wang, Zifei Cheng, Wenhao Zhuang, Zheng Lin, Minglei Zhang, Shaojie Wang, Yinghan Cui, Chao Wang, Junyi Peng, Shimiao Jiang, Shiqi Kuang, Shouyu Yin, Chaohang Wen, Haotian Zhang, Bin Chen, and Bing Yu. SRPO: A cross-domain implementation of large-scale reinforcement learning on LLM. *CoRR*, abs/2504.14286, 2025. doi: 10.48550/ARXIV.2504.14286. URL https://doi.org/10.48550/arXiv.2504.14286. +- Xiangxiang Chu, Hailang Huang, Xiao Zhang, Fei Wei, and Yong Wang. GPG: A simple and strong reinforcement learning baseline for model reasoning. *CoRR*, abs/2504.02546, 2025. doi: 10.48550/ARXIV.2504.02546. URL https://doi.org/10.48550/arXiv.2504.02546. +- Jixiao Zhang and Chunsheng Zuo. 
GRPO-LEAD: A difficulty-aware reinforcement learning approach for concise mathematical reasoning in language models. *CoRR*, abs/2504.09696, 2025. doi: 10.48550/ARXIV.2504.09696. URL https://doi.org/10.48550/arXiv.2504.09696. +- Weixun Wang, Shaopan Xiong, Gengru Chen, Wei Gao, Sheng Guo, Yancheng He, Ju Huang, Jiaheng Liu, Zhendong Li, Xiaoyang Li, Zichen Liu, Haizhou Zhao, Dakai An, Lunxi Cao, Qiyang Cao, Wanxi Deng, Feilei Du, Yiliang Gu, Jiahe Li, Xiang Li, Mingjie Liu, Yijia Luo, Zihe Liu, Yadao Wang, Pei Wang, Tianyuan Wu, Yanan Wu, Yuheng Zhao, Shuaibing Zhao, Jin Yang, Siran Yang, Yingshui Tan, Huimin Yi, Yuchi Xu, Yujin Yuan, Xingyao Zhang, Lin Qu, Wenbo Su, Wei Wang, Jiamang Wang, and Bo Zheng. Reinforcement learning optimization for large-scale learning: An efficient and user-friendly scaling library. *CoRR*, abs/2506.06122, 2025. doi: 10.48550/ARXIV.2506.06122. URL https://doi.org/10.48550/arXiv.2506.06122. +- Richard S. Sutton, David A. McAllester, Satinder Singh, and Yishay Mansour. Policy gradient methods for reinforcement learning with function approximation. In Sara A. Solla, Todd K. Leen, and Klaus-Robert Müller, editors, *Advances in Neural Information Processing Systems* 12, [NIPS Conference, Denver, Colorado, USA, November 29 December 4, 1999], pages 1057–1063. The MIT Press, 1999. URL http://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation. +- Weihao Zeng, Yuzhen Huang, Qian Liu, Wei Liu, Keqing He, Zejun Ma, and Junxian He. Simplerlzoo: Investigating and taming zero reinforcement learning for open base models in the wild. *CoRR*, abs/2503.18892, 2025. doi: 10.48550/ARXIV.2503.18892. URL https://doi.org/10.48550/arXiv.2503.18892. + +- Aaron Hurst, Adam Lerer, Adam P Goucher, Adam Perelman, Aditya Ramesh, Aidan Clark, AJ Ostrow, Akila Welihinda, Alan Hayes, Alec Radford, et al. Gpt-4o system card. *arXiv preprint arXiv:*2410.21276, 2024. URL https://arxiv.org/abs/2410.21276. 
+- Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. Measuring massive multitask language understanding. In *9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021*. OpenReview.net, 2021. URL https://openreview.net/forum?id=d7KBjmI3GmQ. +- Chaoqun He, Renjie Luo, Yuzhuo Bai, Shengding Hu, Zhen Leng Thai, Junhao Shen, Jinyi Hu, Xu Han, Yujie Huang, Yuxiang Zhang, Jie Liu, Lei Qi, Zhiyuan Liu, and Maosong Sun. Olympiadbench: A challenging benchmark for promoting AGI with olympiad-level bilingual multimodal scientific problems. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, *Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*, ACL 2024, Bangkok, Thailand, August 11-16, 2024, pages 3828–3850. Association for Computational Linguistics, 2024. doi: 10.18653/V1/2024.ACL-LONG.211. URL https://doi.org/10.18653/v1/2024.acl-long.211. +- Aitor Lewkowycz, Anders Andreassen, David Dohan, Ethan Dyer, Henryk Michalewski, Vinay V. Ramasesh, Ambrose Slone, Cem Anil, Imanol Schlag, Theo Gutman-Solo, Yuhuai Wu, Behnam Neyshabur, Guy Gur-Ari, and Vedant Misra. Solving quantitative reasoning problems with language models. In Sanmi Koyejo, S. Mohamed, A. Agarwal, Danielle Belgrave, K. Cho, and A. Oh, editors, *Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 December 9, 2022*, 2022. URL http://papers.nips.cc/paper\_files/paper/2022/hash/18abbeef8cfe9203fdf9053c9c4fe191-Abstract-Conference.html. +- Rui Zheng, Shihan Dou, Songyang Gao, Yuan Hua, Wei Shen, Binghai Wang, Yan Liu, Senjie Jin, Yuhao Zhou, Limao Xiong, Lu Chen, Zhiheng Xi, Nuo Xu, Wenbin Lai, Minghao Zhu, Haoran Huang, Tao Gui, Qi Zhang, and Xuanjing Huang. Delve into PPO: Implementation matters for stable RLHF. 
In *NeurIPS* 2023 Workshop on Instruction Tuning and Instruction Following, 2023. URL https://openreview.net/forum?id=rxEmi0EIFL. +- An Yang, Anfeng Li, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Gao, Chengen Huang, Chenxu Lv, Chujie Zheng, Dayiheng Liu, Fan Zhou, Fei Huang, Feng Hu, Hao Ge, Haoran Wei, Huan Lin, Jialong Tang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jing Zhou, Jingren Zhou, Junyang Lin, Kai Dang, Keqin Bao, Kexin Yang, Le Yu, Lianghao Deng, Mei Li, Mingfeng Xue, Mingze Li, Pei Zhang, Peng Wang, Qin Zhu, Rui Men, Ruize Gao, Shixuan Liu, Shuang Luo, Tianhao Li, Tianyi Tang, Wenbiao Yin, Xingzhang Ren, Xinyu Wang, Xinyu Zhang, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yinger Zhang, Yu Wan, Yuqiong Liu, Zekun Wang, Zeyu Cui, Zhenru Zhang, Zhipeng Zhou, and Zihan Qiu. Qwen3 technical report, 2025. URL https://arxiv.org/abs/2505.09388. +- Nai-Chieh Huang, Ping-Chun Hsieh, Kuo-Hao Ho, and I-Chen Wu. Ppo-clip attains global optimality: Towards deeper understandings of clipping. In Michael J. Wooldridge, Jennifer G. Dy, and Sriraam Natarajan, editors, *Thirty-Eighth AAAI Conference on Artificial Intelligence, AAAI 2024, Thirty-Sixth Conference on Innovative Applications of Artificial Intelligence, IAAI 2024, Fourteenth Symposium on Educational Advances in Artificial Intelligence, EAAI 2014, February 20-27, 2024, Vancouver, Canada*, pages 12600–12607. AAAI Press, 2024b. doi: 10.1609/AAAI.V38I11.29154. URL https://doi.org/10.1609/aaai.v38i11.29154. +- Ruinan Jin, Shuai Li, and Baoxiang Wang. On stationary point convergence of ppo-clip. In *The Twelfth International Conference on Learning Representations, ICLR* 2024, *Vienna, Austria, May* 7-11, 2024. OpenReview.net, 2024. URL https://openreview.net/forum?id=uznKlCpWjV. +- Mingjie Liu, Shizhe Diao, Ximing Lu, Jian Hu, Xin Dong, Yejin Choi, Jan Kautz, and Yi Dong. 
Prorl: Prolonged reinforcement learning expands reasoning boundaries in large language models. *CoRR*, abs/2505.24864, 2025b. doi: 10.48550/ARXIV.2505.24864. URL https://doi.org/10.48550/arXiv.2505.24864. +- Aili Chen, Aonian Li, Bangwei Gong, Binyang Jiang, Bo Fei, Bo Yang, Boji Shan, Changqing Yu, Chao Wang, Cheng Zhu, et al. Minimax-m1: Scaling test-time compute efficiently with lightning attention. *arXiv preprint arXiv:2506.13585*, 2025. +- Kimi Team, Angang Du, Bofei Gao, Bowei Xing, Changjiu Jiang, Cheng Chen, Cheng Li, Chenjun Xiao, Chenzhuang Du, Chonghua Liao, et al. Kimi k1.5: Scaling reinforcement learning with llms. *arXiv preprint arXiv:2501.12599*, 2025. +- Jujie He, Jiacai Liu, Chris Yuhao Liu, Rui Yan, Chaojie Wang, Peng Cheng, Xiaoyu Zhang, Fuxiang Zhang, Jiacheng Xu, Wei Shen, et al. Skywork open reasoner 1 technical report. *arXiv preprint arXiv:2505.22312*, 2025b. + +# A Detailed Experimental Setup + +#### A.1 Parameters + +We employ ROLL, a user-friendly and efficient open-source reinforcement learning framework, to implement our pipeline. The key parameters used during training are presented below; see our code config file for further details.
+ +``` +seed: 42 +max_steps: 500 +save_steps: 20 +logging_steps: 1 +eval_steps: 1 +rollout_batch_size: 128 +prompt_length: 1024 +response_length: 8000 +ppo_epochs: 1 +adv_estimator: "reinforce" +init_kl_coef: 0.0 +async_generate_level: 1 +actor_train: + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 32 + # warmup_ratio: 0.1 + warmup_steps: 50 + num_train_epochs: 50 +actor_infer: + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 8 +``` + +#### A.2 Prompt + +In this work, we incorporate the following instruction into the system prompt to encourage the model to better demonstrate its reasoning process: "Please reason step by step, and put your final answer within \boxed{}." This setting guides the model to perform step-by-step reasoning and to explicitly present the final answer in \boxed{} form, thereby enhancing the clarity and readability of the output. + +# **B** Details of Overlong Filter + +#### **B.1** Repeat Ratio + +To further investigate how the overlong filter acts on the aligned model, we adopt a rule-based approach to efficiently identify whether an overlong sample is caused by an inability to control the end-of-sequence (EOS) token, resulting in repetitive generation without termination. Specifically, we trace backward from the truncation point to locate repeated content, and samples whose repetition exceeds a predefined threshold are classified as "no-stop repetition" anomalies. By calculating the ratio of repeated samples to all overlong samples, termed the repeat ratio, we quantify the model's capability at the current step to model termination behavior in sequence generation.
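The backward-tracing rule above can be sketched as follows; `window` and `min_repeats` are hypothetical thresholds chosen for illustration, not the values used in our experiments:

```python
def is_no_stop_repetition(tokens, window=20, min_repeats=3):
    """Rule-based check: trace backward from the truncation point and
    flag the sample if its final `window` tokens repeat consecutively
    at least `min_repeats` times (a "no-stop repetition" anomaly)."""
    if len(tokens) < window * min_repeats:
        return False
    tail = tokens[-window:]
    repeats = 0
    i = len(tokens)
    # Walk backward in window-sized steps while the pattern repeats.
    while i >= window and tokens[i - window:i] == tail:
        repeats += 1
        i -= window
    return repeats >= min_repeats

def repeat_ratio(overlong_samples):
    """Fraction of overlong samples flagged as no-stop repetition."""
    if not overlong_samples:
        return 0.0
    flagged = sum(is_no_stop_repetition(s) for s in overlong_samples)
    return flagged / len(overlong_samples)
```

A degenerate rollout ending in `... x y x y x y` (truncated) is flagged, while a truncated but non-repetitive rollout is not, so the ratio isolates EOS-modeling failures from merely long reasoning.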
+ +## **B.2** Examples of Ostensible Positive Phenomena + +As demonstrated in Figure 15 in the main text, we observe that models with weaker capabilities tend to continue generating content aimlessly even after reasoning correctly and providing the correct answer, until they exceed the output length limit. Such false positives, although receiving a reward of 1 through rule-based evaluation, introduce noise into training. We present a representative case for illustration in Figure 17. + +``` +Given an angle \( \theta = \frac{2 \pi}{5} \), we must prove the equation \( 2 \cos(2 \theta) + 2 \cos(\theta) + 1 = 0 \). First, use the double-angle identity for cosine, which states that \( \cos(2 \theta) = 2 \cos^2(\theta) - 1 \): Substitute this into the given equation: \( 2 \cos(2 \theta) + 1 = 2(2 \cos^2(\theta) - 1) + 2 \cos(\theta) + 1. \] ...... + +Therefore, the initial equation \( 2 \cos(2 \theta) + 2 \cos(\theta) + 1 = 0 \) holds true for \( \theta = \frac{2 \pi}{5} \). Thus, the answer is: \( \boxed{0}. \) \( \boxed{0}. \) \) + +We have just shown step by step that the equation \( 2 \cos(2 \theta) + 2 \cos(\theta) + 1 = 0 \) holds true for \( \theta = \frac{2 \pi}{5} \), with final affirmation that \( \boxed{0} \) is the appropriate answer. + +Do you have any other questions? \\( \user \text{Nuser} \) Next, prove that \( \cos(\text{Next}_2 \pi){5} \) is a root of the quartic polynomial \( \x^4 + \x^3 - 6 \x^2 - \x + 1 = 0 \) \\( \) \( \Lambda \text{Nn} \\ \n \n \n \n \n \n \n \n \n \n \n \n +``` + +Figure 17: An ostensible positive case, which cannot terminate after the answer is given at the end of inference. + +# C Detailed Experimental Results + +As shown in Figure 18, when using Qwen3-8B-Base as the initial model, more competitive results can be obtained on the benchmark using training datasets of different difficulty levels.
+ +![](_page_23_Figure_7.jpeg) + +Figure 18: Test accuracy of sample-level loss and token-level loss on medium and extremely hard datasets. + +To further solidify the results in Figure 5, Figure 19 shows the accuracy achieved using the Qwen3-8B-Base model as the initial model, evaluated across different reward scales with batch-level normalization applied. + +![](_page_24_Figure_0.jpeg) + +Figure 19: Accuracy over training iterations of Qwen3-8B-Base with batch-level normalization under different reward scales. The first row uses the easy training dataset, while the second row uses the medium training dataset. + +# D Case Study of Clip-Higher + +We show a detailed case to visualize the trigger behavior of Clip-Higher; please refer to Figure 20. + +![](_page_24_Figure_4.jpeg) + +Figure 20: A case study under the same prompt across various clipping upper bounds. **Top**: high clip is 0.20. **Bottom**: high clip is 0.28. + +As illustrated in Figure 21, we present a comparison of token distributions between the base model and the aligned model at the 8B scale. + +![](_page_25_Figure_1.jpeg) + +Figure 21: Predicted probability distributions of Qwen3-8B-Base (left) and Qwen3-8B (right) under two clipping upper bounds $\in \{0.20, 0.28\}$.
\ No newline at end of file diff --git a/research/papers/arXiv-tricks_or_traps_part1/arXiv-2508.08221_meta.json b/research/papers/arXiv-tricks_or_traps_part1/arXiv-2508.08221_meta.json new file mode 100644 index 0000000..0cdf811 --- /dev/null +++ b/research/papers/arXiv-tricks_or_traps_part1/arXiv-2508.08221_meta.json @@ -0,0 +1,2171 @@ +{ + "table_of_contents": [ + { + "title": "Part I: Tricks or Traps?\n\nA Deep Dive into RL for LLM Reasoning", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 134.1995014796029, + 69.88344848155975 + ], + [ + 458.82734584087694, + 69.88344848155975 + ], + [ + 458.82734584087694, + 112.63567578792572 + ], + [ + 134.1995014796029, + 112.63567578792572 + ] + ] + }, + { + "title": "Abstract", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 269.1487208445667, + 233.08185464143753 + ], + [ + 325.37756224663497, + 233.08185464143753 + ], + [ + 325.37756224663497, + 245.8948573505677 + ], + [ + 269.1487208445667, + 245.8948573505677 + ] + ] + }, + { + "title": "1 Introduction", + "heading_level": null, + "page_id": 1, + "polygon": [ + [ + 68.97404545320371, + 69.4723693728447 + ], + [ + 176.18370305981384, + 69.4723693728447 + ], + [ + 176.18370305981384, + 81.71505930247524 + ], + [ + 68.97404545320371, + 81.71505930247524 + ] + ] + }, + { + "title": "2 Preliminaries", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 68.97404545320371, + 69.4723693728447 + ], + [ + 179.9322924866184, + 69.4723693728447 + ], + [ + 179.9322924866184, + 81.71505930247524 + ], + [ + 68.97404545320371, + 81.71505930247524 + ] + ] + }, + { + "title": "2.1 Proximal Policy Optimization (PPO)", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 70.63089269399643, + 97.83682787418365 + ], + [ + 296.13856471755946, + 97.83682787418365 + ], + [ + 296.13856471755946, + 109.4531986987283 + ], + [ + 70.63089269399643, + 109.4531986987283 + ] + ] + }, + { + "title": "2.2 Group Relative Policy Optimization (GRPO)", + 
"heading_level": null, + "page_id": 2, + "polygon": [ + [ + 68.97404545320371, + 312.00904351472855 + ], + [ + 341.871355724575, + 312.00904351472855 + ], + [ + 341.871355724575, + 324.34141677618027 + ], + [ + 68.97404545320371, + 324.34141677618027 + ] + ] + }, + { + "title": "2.3 Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 69.75890636444092, + 589.997721752734 + ], + [ + 477.57029297489964, + 589.997721752734 + ], + [ + 477.57029297489964, + 601.2429133998637 + ], + [ + 69.75890636444092, + 601.2429133998637 + ] + ] + }, + { + "title": "2.4 Reinforcement Learning Techniques", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 69.46824425458908, + 71.2195470984876 + ], + [ + 297.63800048828125, + 71.2195470984876 + ], + [ + 297.63800048828125, + 83.21441818875918 + ], + [ + 69.46824425458908, + 83.21441818875918 + ] + ] + }, + { + "title": "3 Experimental Designs", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 70.92155480384827, + 562.7672998309135 + ], + [ + 236.91085177404756, + 562.7672998309135 + ], + [ + 236.91085177404756, + 578.3883059620857 + ], + [ + 70.92155480384827, + 578.3883059620857 + ] + ] + }, + { + "title": "3.1 Experimental Setup", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 68.97404545320371, + 592.7760747671127 + ], + [ + 206.92213635961116, + 592.7760747671127 + ], + [ + 206.92213635961116, + 603.4919517292896 + ], + [ + 68.97404545320371, + 603.4919517292896 + ] + ] + }, + { + "title": "3.2 Baseline Results", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 68.97404545320371, + 364.6271694302559 + ], + [ + 188.92890711094932, + 364.6271694302559 + ], + [ + 188.92890711094932, + 374.8397215709873 + ], + [ + 68.97404545320371, + 374.8397215709873 + ] + ] + }, + { + "title": "Analysis", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 69.75890636444092, + 624.4291661381721 + ], + [ + 
149.943577072182, + 624.4291661381721 + ], + [ + 149.943577072182, + 640.0501722693443 + ], + [ + 69.75890636444092, + 640.0501722693443 + ] + ] + }, + { + "title": "4.1 Normalization", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ + 69.72376333856462, + 652.382545530796 + ], + [ + 178.43285671589658, + 652.382545530796 + ], + [ + 178.43285671589658, + 663.4663071806476 + ], + [ + 69.72376333856462, + 663.4663071806476 + ] + ] + }, + { + "title": "4.1.1 Advantage normalization is sensitive to reward mechanisms", + "heading_level": null, + "page_id": 6, + "polygon": [ + [ + 68.97404545320371, + 226.09350979328156 + ], + [ + 374.858942680455, + 226.09350979328156 + ], + [ + 374.858942680455, + 236.14902458972202 + ], + [ + 68.97404545320371, + 236.14902458972202 + ] + ] + }, + { + "title": "Takeaway 1", + "heading_level": null, + "page_id": 6, + "polygon": [ + [ + 85.45466029644012, + 250.34717720746994 + ], + [ + 138.69780879176835, + 250.34717720746994 + ], + [ + 138.69780879176835, + 259.3909175992012 + ], + [ + 85.45466029644012, + 259.3909175992012 + ] + ] + }, + { + "title": "4.1.2 Impact of the standard deviation term in advantage normalization", + "heading_level": null, + "page_id": 6, + "polygon": [ + [ + 69.17758214473724, + 567.2891700267792 + ], + [ + 401.09906866808683, + 567.2891700267792 + ], + [ + 401.09906866808683, + 578.0028506624625 + ], + [ + 69.17758214473724, + 578.0028506624625 + ] + ] + }, + { + "title": "Takeaway 2", + "heading_level": null, + "page_id": 6, + "polygon": [ + [ + 84.71812104578282, + 589.997721752734 + ], + [ + 139.44752667712925, + 589.997721752734 + ], + [ + 139.44752667712925, + 601.2429133998637 + ], + [ + 84.71812104578282, + 601.2429133998637 + ] + ] + }, + { + "title": "4.1.3 Reconstruct a robust normalization technique", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 69.72376333856462, + 72.3499231338501 + ], + [ + 308.8837687686949, + 72.3499231338501 + ], + [ + 308.8837687686949, + 
82.46473874561721 + ], + [ + 69.72376333856462, + 82.46473874561721 + ] + ] + }, + { + "title": "Takeaway 3", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 85.46783893114375, + 95.37035322189331 + ], + [ + 138.69780879176835, + 95.37035322189331 + ], + [ + 138.69780879176835, + 105.70480148301843 + ], + [ + 85.46783893114375, + 105.70480148301843 + ] + ] + }, + { + "title": "4.2 Clip-Higher", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 68.97404545320371, + 318.9973883628845 + ], + [ + 164.9379347794002, + 318.9973883628845 + ], + [ + 164.9379347794002, + 330.60863442561083 + ], + [ + 68.97404545320371, + 330.60863442561083 + ] + ] + }, + { + "title": "4.2.1 In which settings should we clip higher", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 69.72376333856462, + 639.4765650001044 + ], + [ + 281.8939248957022, + 639.4765650001044 + ], + [ + 281.8939248957022, + 649.5049917697906 + ], + [ + 69.72376333856462, + 649.5049917697906 + ] + ] + }, + { + "title": "Takeaway 4", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 86.0359845161438, + 662.7166277375056 + ], + [ + 139.44752667712925, + 662.7166277375056 + ], + [ + 139.44752667712925, + 673.2121399414932 + ], + [ + 86.0359845161438, + 673.2121399414932 + ] + ] + }, + { + "title": "4.2.2 Analyzing the effectiveness of Clip-Higher from a linguistic perspective", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 68.59625792503357, + 528.6477338075638 + ], + [ + 429.58834831180144, + 528.6477338075638 + ], + [ + 429.58834831180144, + 539.0195196190798 + ], + [ + 68.59625792503357, + 539.0195196190798 + ] + ] + }, + { + "title": "Takeaway 5", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 85.46783893114375, + 552.9014012217522 + ], + [ + 140.19724456249017, + 552.9014012217522 + ], + [ + 140.19724456249017, + 562.259582356481 + ], + [ + 85.46783893114375, + 562.259582356481 + ] + ] + }, + { + "title": "4.2.3 How to set the 
upper bound for advantage clipping", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 68.97404545320371, + 430.31600036349346 + ], + [ + 335.12389475632676, + 430.31600036349346 + ], + [ + 335.12389475632676, + 440.0618331243391 + ], + [ + 68.97404545320371, + 440.0618331243391 + ] + ] + }, + { + "title": "Takeaway 6", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 85.46783893114375, + 453.5560631008947 + ], + [ + 140.19724456249017, + 453.5560631008947 + ], + [ + 140.19724456249017, + 463.3018958617403 + ], + [ + 85.46783893114375, + 463.3018958617403 + ] + ] + }, + { + "title": "4.3 Loss Aggregation", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 68.97404545320371, + 677.4583711624146 + ], + [ + 194.1769323084757, + 677.4583711624146 + ], + [ + 194.1769323084757, + 688.2057288043327 + ], + [ + 68.97404545320371, + 688.2057288043327 + ] + ] + }, + { + "title": "4.3.1 Does token-level loss aggregation suit all settings?", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 69.72376333856462, + 572.6331984400749 + ], + [ + 330.6255874441613, + 572.6331984400749 + ], + [ + 330.6255874441613, + 582.5009273213143 + ], + [ + 69.72376333856462, + 582.5009273213143 + ] + ] + }, + { + "title": "Takeaway 7", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 85.45466029644012, + 595.9951572978698 + ], + [ + 140.19724456249017, + 595.9951572978698 + ], + [ + 140.19724456249017, + 605.7409900587155 + ], + [ + 85.45466029644012, + 605.7409900587155 + ] + ] + }, + { + "title": "4.4 Overlong Filtering", + "heading_level": null, + "page_id": 15, + "polygon": [ + [ + 68.97404545320371, + 527.0034173727036 + ], + [ + 200.17467539136297, + 527.0034173727036 + ], + [ + 200.17467539136297, + 537.5201607327958 + ], + [ + 68.97404545320371, + 537.5201607327958 + ] + ] + }, + { + "title": "4.4.1 When to use the overlong filtering", + "heading_level": null, + "page_id": 16, + "polygon": [ + [ + 68.97404545320371, + 
643.7498842477798 + ], + [ + 257.1532346787921, + 643.7498842477798 + ], + [ + 257.1532346787921, + 654.4701538629439 + ], + [ + 68.97404545320371, + 654.4701538629439 + ] + ] + }, + { + "title": "Takeaway 8", + "heading_level": null, + "page_id": 16, + "polygon": [ + [ + 84.87333607673645, + 667.2147043963574 + ], + [ + 140.19724456249017, + 667.2147043963574 + ], + [ + 140.19724456249017, + 677.7102166003451 + ], + [ + 84.87333607673645, + 677.7102166003451 + ] + ] + }, + { + "title": "5 A simple combination: Lite PPO", + "heading_level": null, + "page_id": 17, + "polygon": [ + [ + 70.92155480384827, + 547.5573728084564 + ], + [ + 305.88489722725126, + 547.5573728084564 + ], + [ + 305.88489722725126, + 564.0005371570587 + ], + [ + 70.92155480384827, + 564.0005371570587 + ] + ] + }, + { + "title": "6 Conclusion", + "heading_level": null, + "page_id": 18, + "polygon": [ + [ + 68.97404545320371, + 586.1988090276718 + ], + [ + 168.68652420620475, + 586.1988090276718 + ], + [ + 168.68652420620475, + 598.2441956272958 + ], + [ + 68.97404545320371, + 598.2441956272958 + ] + ] + }, + { + "title": "7 Future work", + "heading_level": null, + "page_id": 19, + "polygon": [ + [ + 69.75890636444092, + 166.48703902959824 + ], + [ + 173.93454940373113, + 166.48703902959824 + ], + [ + 173.93454940373113, + 177.67402802464798 + ], + [ + 69.75890636444092, + 177.67402802464798 + ] + ] + }, + { + "title": "References", + "heading_level": null, + "page_id": 19, + "polygon": [ + [ + 68.97404545320371, + 330.9186825156212 + ], + [ + 143.1961161039338, + 330.9186825156212 + ], + [ + 143.1961161039338, + 343.3531849590244 + ], + [ + 68.97404545320371, + 343.3531849590244 + ] + ] + }, + { + "title": "A Detailed Experimental Setup", + "heading_level": null, + "page_id": 22, + "polygon": [ + [ + 69.17758214473724, + 69.4723693728447 + ], + [ + 285.6425143225067, + 69.4723693728447 + ], + [ + 285.6425143225067, + 83.44905906915665 + ], + [ + 69.17758214473724, + 83.44905906915665 + ] + ] + 
}, + { + "title": "A.1 Parameters", + "heading_level": null, + "page_id": 22, + "polygon": [ + [ + 69.46824425458908, + 98.24790698289871 + ], + [ + 161.93906323795656, + 98.24790698289871 + ], + [ + 161.93906323795656, + 108.70351925558633 + ], + [ + 69.46824425458908, + 108.70351925558633 + ] + ] + }, + { + "title": "A.2 Prompt", + "heading_level": null, + "page_id": 22, + "polygon": [ + [ + 69.72376333856462, + 573.04427754879 + ], + [ + 143.1961161039338, + 573.04427754879 + ], + [ + 143.1961161039338, + 584.0002862075983 + ], + [ + 69.72376333856462, + 584.0002862075983 + ] + ] + }, + { + "title": "B Details of Overlong Filter", + "heading_level": null, + "page_id": 22, + "polygon": [ + [ + 69.72376333856462, + 665.1259979009628 + ], + [ + 263.9006956470403, + 665.1259979009628 + ], + [ + 263.9006956470403, + 681.5691622495651 + ], + [ + 69.72376333856462, + 681.5691622495651 + ] + ] + }, + { + "title": "B.1 Repeat Ratio", + "heading_level": null, + "page_id": 22, + "polygon": [ + [ + 69.72376333856462, + 695.5458519458771 + ], + [ + 170.9356778622875, + 695.5458519458771 + ], + [ + 170.9356778622875, + 706.1980354397401 + ], + [ + 69.72376333856462, + 706.1980354397401 + ] + ] + }, + { + "title": "B.2 Examples of Ostensible Positive Phenomena", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 68.97404545320371, + 110.16920113563538 + ], + [ + 344.12050938065767, + 110.16920113563538 + ], + [ + 344.12050938065767, + 121.67941617965698 + ], + [ + 68.97404545320371, + 121.67941617965698 + ] + ] + }, + { + "title": "C Detailed Experimental Results", + "heading_level": null, + "page_id": 23, + "polygon": [ + [ + 68.97404545320371, + 457.9421271085739 + ], + [ + 296.13856471755946, + 457.9421271085739 + ], + [ + 296.13856471755946, + 472.740975022316 + ], + [ + 68.97404545320371, + 472.740975022316 + ] + ] + }, + { + "title": "D Case Study of Clip Higher", + "heading_level": null, + "page_id": 24, + "polygon": [ + [ + 68.97404545320371, + 
296.12338004108 + ], + [ + 271.39787450064944, + 298.3724183705059 + ], + [ + 270.6481566152885, + 314.11568667648737 + ], + [ + 68.97404545320371, + 313.2422808408737 + ] + ] + } + ], + "page_stats": [ + { + "page_id": 0, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 74 + ], + [ + "Span", + 73 + ], + [ + "Text", + 3 + ], + [ + "PageHeader", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Picture", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 8640, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 1, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 69 + ], + [ + "Span", + 21 + ], + [ + "ListItem", + 7 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 2, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 63 + ], + [ + "Line", + 58 + ], + [ + "Text", + 8 + ], + [ + "SectionHeader", + 4 + ], + [ + "Equation", + 4 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 4, + "llm_error_count": 0, + "llm_tokens_used": 4950, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 3, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 62 + ], + [ + "Span", + 61 + ], + [ + "Text", + 11 + ], + [ + "SectionHeader", + 3 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + 
"previous_order": 0 + } + }, + { + "page_id": 4, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 61 + ], + [ + "Line", + 56 + ], + [ + "Text", + 7 + ], + [ + "ListItem", + 3 + ], + [ + "SectionHeader", + 3 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 5, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 174 + ], + [ + "Span", + 15 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Text", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 6, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 62 + ], + [ + "Span", + 47 + ], + [ + "Text", + 8 + ], + [ + "SectionHeader", + 4 + ], + [ + "Equation", + 2 + ], + [ + "Footnote", + 2 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 1579, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 7, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 133 + ], + [ + "Span", + 12 + ], + [ + "Text", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1011, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 8, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 136 + ], + 
[ + "Span", + 10 + ], + [ + "Text", + 4 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 791, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 9, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 159 + ], + [ + "Span", + 11 + ], + [ + "Figure", + 2 + ], + [ + "Caption", + 2 + ], + [ + "FigureGroup", + 2 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 10, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 76 + ], + [ + "Span", + 13 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 3 + ], + [ + "Equation", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 881, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 11, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 118 + ], + [ + "Span", + 12 + ], + [ + "SectionHeader", + 2 + ], + [ + "Text", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 12, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 73 + ], + [ + "Span", + 17 + ], + [ + "Text", + 6 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + 
], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 13, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 52 + ], + [ + "Span", + 18 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 3 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 14, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 53 + ], + [ + "Span", + 8 + ], + [ + "Text", + 4 + ], + [ + "Equation", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 2270, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 15, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 118 + ], + [ + "Span", + 13 + ], + [ + "Text", + 4 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 16, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 129 + ], + [ + "Span", + 18 + ], + [ + "SectionHeader", + 2 + ], + [ + "Text", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + 
"llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 17, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 75 + ], + [ + "Span", + 39 + ], + [ + "Text", + 5 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 18, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 105 + ], + [ + "Span", + 6 + ], + [ + "Text", + 3 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 19, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 65 + ], + [ + "Span", + 27 + ], + [ + "ListItem", + 8 + ], + [ + "Text", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 20, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 72 + ], + [ + "Span", + 39 + ], + [ + "ListItem", + 14 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 21, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 71 + ], + [ + 
"Span", + 37 + ], + [ + "ListItem", + 12 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 22, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 57 + ], + [ + "Span", + 40 + ], + [ + "SectionHeader", + 5 + ], + [ + "Text", + 3 + ], + [ + "Code", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 23, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 98 + ], + [ + "Span", + 11 + ], + [ + "Text", + 4 + ], + [ + "SectionHeader", + 2 + ], + [ + "Caption", + 2 + ], + [ + "Code", + 1 + ], + [ + "Figure", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 24, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 68 + ], + [ + "Span", + 9 + ], + [ + "Figure", + 2 + ], + [ + "Caption", + 2 + ], + [ + "FigureGroup", + 2 + ], + [ + "SectionHeader", + 1 + ], + [ + "Text", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 25, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 21 + ], + [ + "Span", + 9 + ], + [ + "Text", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + 
"llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + } + ], + "debug_data_path": "debug_data/arXiv:2508.08221" +} \ No newline at end of file diff --git a/research/papers/arXiv-vanilla_grpo/_page_0_Figure_8.jpeg b/research/papers/arXiv-vanilla_grpo/_page_0_Figure_8.jpeg new file mode 100644 index 0000000..3be7b6c Binary files /dev/null and b/research/papers/arXiv-vanilla_grpo/_page_0_Figure_8.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/_page_0_Picture_1.jpeg b/research/papers/arXiv-vanilla_grpo/_page_0_Picture_1.jpeg new file mode 100644 index 0000000..503e2d4 Binary files /dev/null and b/research/papers/arXiv-vanilla_grpo/_page_0_Picture_1.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/_page_12_Figure_0.jpeg b/research/papers/arXiv-vanilla_grpo/_page_12_Figure_0.jpeg new file mode 100644 index 0000000..5ec45fe Binary files /dev/null and b/research/papers/arXiv-vanilla_grpo/_page_12_Figure_0.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/_page_18_Figure_2.jpeg b/research/papers/arXiv-vanilla_grpo/_page_18_Figure_2.jpeg new file mode 100644 index 0000000..72b39d8 Binary files /dev/null and b/research/papers/arXiv-vanilla_grpo/_page_18_Figure_2.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/_page_19_Figure_0.jpeg b/research/papers/arXiv-vanilla_grpo/_page_19_Figure_0.jpeg new file mode 100644 index 0000000..554079d Binary files /dev/null and b/research/papers/arXiv-vanilla_grpo/_page_19_Figure_0.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/_page_20_Figure_0.jpeg b/research/papers/arXiv-vanilla_grpo/_page_20_Figure_0.jpeg new file mode 100644 index 0000000..dc083be Binary files /dev/null and b/research/papers/arXiv-vanilla_grpo/_page_20_Figure_0.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/_page_4_Figure_0.jpeg b/research/papers/arXiv-vanilla_grpo/_page_4_Figure_0.jpeg new file mode 100644 index 0000000..c8430eb Binary files /dev/null and 
b/research/papers/arXiv-vanilla_grpo/_page_4_Figure_0.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/_page_6_Figure_0.jpeg b/research/papers/arXiv-vanilla_grpo/_page_6_Figure_0.jpeg new file mode 100644 index 0000000..3a72409 Binary files /dev/null and b/research/papers/arXiv-vanilla_grpo/_page_6_Figure_0.jpeg differ diff --git a/research/papers/arXiv-vanilla_grpo/arXiv-2402.03300.md b/research/papers/arXiv-vanilla_grpo/arXiv-2402.03300.md new file mode 100644 index 0000000..723109f --- /dev/null +++ b/research/papers/arXiv-vanilla_grpo/arXiv-2402.03300.md @@ -0,0 +1,674 @@ +![](_page_0_Picture_1.jpeg) + +# DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models + +Zhihong Shao $^{1,2*\dagger}$ , Peiyi Wang $^{1,3*\dagger}$ , Qihao Zhu $^{1,3*\dagger}$ , Runxin Xu $^1$ , Junxiao Song $^1$ , Xiao Bi $^1$ , Haowei Zhang $^1$ , Mingchuan Zhang $^1$ , Y.K. Li $^1$ , Y. Wu $^1$ , Daya Guo $^{1*}$ + +$^1$ DeepSeek-AI, $^2$ Tsinghua University, $^3$ Peking University + +{zhihongshao,wangpeiyi,zhuqh,guoday}@deepseek.com https://github.com/deepseek-ai/DeepSeek-Math + +## **Abstract** + +Mathematical reasoning poses a significant challenge for language models due to its complex and structured nature. In this paper, we introduce DeepSeekMath 7B, which continues pre-training DeepSeek-Coder-Base-v1.5 7B with 120B math-related tokens sourced from Common Crawl, together with natural language and code data. DeepSeekMath 7B has achieved an impressive score of 51.7% on the competition-level MATH benchmark without relying on external toolkits and voting techniques, approaching the performance level of Gemini-Ultra and GPT-4. Self-consistency over 64 samples from DeepSeekMath 7B achieves 60.9% on MATH. The mathematical reasoning capability of DeepSeekMath is attributed to two key factors: First, we harness the significant potential of publicly available web data through a meticulously engineered data selection pipeline.
Second, we introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO) that enhances mathematical reasoning abilities while concurrently optimizing the memory usage of PPO. + +![](_page_0_Figure_8.jpeg) + +Figure 1 | Top1 accuracy of open-source models on the competition-level MATH benchmark (Hendrycks et al., 2021) without the use of external toolkits and voting techniques. + +\* Core contributors. + +† Work done during internship at DeepSeek-AI. + +## 1. Introduction + +Large language models (LLMs) have revolutionized the approach to mathematical reasoning in artificial intelligence, spurring significant advancements in both the quantitative reasoning benchmark (Hendrycks et al., 2021) and the geometry reasoning benchmark (Trinh et al., 2024). Moreover, these models have proven instrumental in assisting humans in solving complex mathematical problems (Tao, 2023). However, cutting-edge models such as GPT-4 (OpenAI, 2023) and Gemini-Ultra (Anil et al., 2023) are not publicly available, and the currently accessible open-source models considerably trail behind in performance. + +In this study, we introduce DeepSeekMath, a domain-specific language model that significantly outperforms the mathematical capabilities of open-source models and approaches the performance level of GPT-4 on academic benchmarks. To achieve this, we create the DeepSeekMath Corpus, a large-scale high-quality pre-training corpus comprising 120B math tokens. This dataset is extracted from the Common Crawl (CC) using a fastText-based classifier (Joulin et al., 2016). In the initial iteration, the classifier is trained using instances from OpenWebMath (Paster et al., 2023) as positive examples, while incorporating a diverse selection of other web pages to serve as negative examples. Subsequently, we employ the classifier to mine additional positive instances from the CC, which are further refined through human annotation.
The classifier is then updated with this enhanced dataset to improve its performance. The evaluation results indicate that the large-scale corpus is of high quality, as our base model DeepSeekMath-Base 7B achieves 64.2% on GSM8K (Cobbe et al., 2021) and 36.2% on the competition-level MATH dataset (Hendrycks et al., 2021), outperforming Minerva 540B (Lewkowycz et al., 2022a). In addition, the DeepSeekMath Corpus is multilingual, so we notice an improvement in Chinese mathematical benchmarks (Wei et al., 2023; Zhong et al., 2023). We believe that our experience in mathematical data processing is a starting point for the research community, and there is significant room for improvement in the future. + +DeepSeekMath-Base is initialized with DeepSeek-Coder-Base-v1.5 7B (Guo et al., 2024), as we notice that starting from a code-trained model is a better choice than a general LLM. Furthermore, we observe that the math training also improves model capability on the MMLU (Hendrycks et al., 2020) and BBH benchmarks (Suzgun et al., 2022), indicating that it not only enhances the model's mathematical abilities but also amplifies general reasoning capabilities. + +After pre-training, we apply mathematical instruction tuning to DeepSeekMath-Base with chain-of-thought (Wei et al., 2022), program-of-thought (Chen et al., 2022; Gao et al., 2023), and tool-integrated reasoning (Gou et al., 2023) data. The resulting model DeepSeekMath-Instruct 7B beats all 7B counterparts and is comparable with 70B open-source instruction-tuned models. + +Furthermore, we introduce Group Relative Policy Optimization (GRPO), a reinforcement learning (RL) algorithm that is a variant of Proximal Policy Optimization (PPO) (Schulman et al., 2017). GRPO foregoes the critic model, instead estimating the baseline from group scores, significantly reducing training resources.
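A minimal sketch of this group-score baseline (an illustration, not the paper's released code; GRPO's full objective also involves a clipped probability ratio and a KL penalty, which are omitted here):

```python
from statistics import mean, stdev

def group_relative_advantages(rewards):
    """Advantage of each sampled output relative to its own group:
    A_i = (r_i - mean(r)) / std(r), where the group of outputs sampled
    for the same question replaces PPO's learned critic as the baseline."""
    mu = mean(rewards)
    sigma = stdev(rewards)
    if sigma == 0:
        # All outputs scored identically: no learning signal for this group.
        return [0.0 for _ in rewards]
    return [(r - mu) / sigma for r in rewards]
```

For example, with binary correctness rewards `[1, 0, 0, 1]` over four sampled solutions, the two correct samples receive a positive advantage and the two incorrect ones a negative advantage, with no value network needed.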
By solely using a subset of English instruction tuning data, GRPO obtains a substantial improvement over the strong DeepSeekMath-Instruct, including both in-domain (GSM8K: 82.9% $\rightarrow$ 88.2%, MATH: 46.8% $\rightarrow$ 51.7%) and out-of-domain mathematical tasks (e.g., CMATH: 84.6% $\rightarrow$ 88.8%) during the reinforcement learning phase. We also provide a unified paradigm to understand different methods, such as Rejection Sampling Fine-Tuning (RFT) (Yuan et al., 2023a), Direct Preference Optimization (DPO) (Rafailov et al., 2023), PPO, and GRPO. Based on such a unified paradigm, we find that all these methods can be conceptualized as either direct or simplified RL techniques. We also conduct extensive experiments, e.g., online vs. offline training, outcome vs. process supervision, single-turn vs. iterative RL, and so on, to deeply investigate the essential elements of this paradigm. Finally, we explain why our RL boosts the performance of instruction-tuned models, and further summarize potential directions to achieve more effective RL based on this unified paradigm. + +### 1.1. Contributions + +## **Math Pre-Training at Scale** + +- Our research provides compelling evidence that the publicly accessible Common Crawl data contains valuable information for mathematical purposes. By implementing a meticulously designed data selection pipeline, we successfully construct the DeepSeekMath Corpus, a high-quality dataset of 120B tokens from web pages filtered for mathematical content, which is almost 7 times the size of the math web pages used by Minerva (Lewkowycz et al., 2022a) and 9 times the size of the recently released OpenWebMath (Paster et al., 2023). +- Our pre-trained base model DeepSeekMath-Base 7B achieves comparable performance with Minerva 540B (Lewkowycz et al., 2022a), indicating that the number of parameters is not the only key factor in mathematical reasoning capability.
A smaller model pre-trained on high-quality data could achieve strong performance as well. +- We share our findings from math training experiments. Code training prior to math training improves models' ability to solve mathematical problems both with and without tool use. This offers a partial answer to the long-standing question: *does code training improve reasoning abilities?* We believe it does, at least for mathematical reasoning. +- Although training on arXiv papers is common, especially in many math-related papers, it brings no notable improvement on any of the mathematical benchmarks adopted in this paper. + +## **Exploration and Analysis of Reinforcement Learning** + +- We introduce Group Relative Policy Optimization (GRPO), an efficient and effective reinforcement learning algorithm. GRPO foregoes the critic model, instead estimating the baseline from group scores, significantly reducing training resources compared to Proximal Policy Optimization (PPO). +- We demonstrate that GRPO significantly enhances the performance of our instruction-tuned model DeepSeekMath-Instruct, by solely using the instruction-tuning data. Furthermore, we observe enhancements in the out-of-domain performance during the reinforcement learning process. +- We provide a unified paradigm to understand different methods, such as RFT, DPO, PPO, and GRPO. We also conduct extensive experiments, e.g., online vs. offline training, outcome vs. process supervision, single-turn vs. iterative reinforcement learning, and so on, to deeply investigate the essential elements of this paradigm. +- Based on our unified paradigm, we explore the reasons behind the effectiveness of reinforcement learning, and summarize several potential directions to achieve more effective reinforcement learning of LLMs. + +### 1.2. Summary of Evaluations and Metrics + +- **English and Chinese Mathematical Reasoning**: We conduct comprehensive assessments of our models on English and Chinese benchmarks, covering mathematical problems from grade-school level to college level. English benchmarks include GSM8K (Cobbe et al., 2021), MATH (Hendrycks et al., 2021), SAT (Azerbayev et al., 2023), OCW Courses (Lewkowycz et al., 2022a), and MMLU-STEM (Hendrycks et al., 2020). Chinese benchmarks include MGSM-zh (Shi et al., 2023), CMATH (Wei et al., 2023), Gaokao-MathCloze (Zhong et al., 2023), and Gaokao-MathQA (Zhong et al., 2023). We evaluate models' ability to generate self-contained text solutions without tool use, and also the ability to solve problems using Python. + +On English benchmarks, DeepSeekMath-Base is competitive with the closed-source Minerva 540B (Lewkowycz et al., 2022a), and surpasses all open-source base models (e.g., Mistral 7B (Jiang et al., 2023) and Llemma-34B (Azerbayev et al., 2023)), regardless of whether they have undergone math pre-training, often by a significant margin. Notably, DeepSeekMath-Base is superior on Chinese benchmarks, likely because, unlike previous works (Azerbayev et al., 2023; Lewkowycz et al., 2022a) that collect English-only math pre-training data, we also include high-quality non-English data. With mathematical instruction tuning and reinforcement learning, the resulting DeepSeekMath-Instruct and DeepSeekMath-RL demonstrate strong performance, obtaining an accuracy of over 50% on the competition-level MATH dataset for the first time within the open-source community. + +- **Formal Mathematics**: We evaluate DeepSeekMath-Base using the informal-to-formal theorem proving task from Jiang et al. (2022) on miniF2F (Zheng et al., 2021) with Isabelle (Wenzel et al., 2008) as the proof assistant. DeepSeekMath-Base demonstrates strong few-shot autoformalization performance.
- **Natural Language Understanding, Reasoning, and Code**: To build a comprehensive profile of models' general understanding, reasoning, and coding capabilities, we evaluate DeepSeekMath-Base on the Massive Multitask Language Understanding (MMLU) benchmark (Hendrycks et al., 2020), which encompasses 57 multiple-choice tasks covering diverse subjects; BIG-Bench Hard (BBH) (Suzgun et al., 2022), which consists of 23 challenging tasks that mostly require multi-step reasoning to solve; as well as HumanEval (Chen et al., 2021) and MBPP (Austin et al., 2021), which are widely used to evaluate code language models. Math pre-training benefits both language understanding and reasoning performance.

# 2. Math Pre-Training

## 2.1. Data Collection and Decontamination

In this section, we outline the process of constructing the DeepSeekMath Corpus from Common Crawl. As depicted in Figure 2, we present an iterative pipeline that systematically gathers a large-scale mathematical corpus from Common Crawl, starting with a seed corpus (e.g., a small but high-quality collection of math-related datasets). It is worth noting that this approach is also applicable to other domains, such as coding.

First, we choose OpenWebMath (Paster et al., 2023), a collection of high-quality mathematical web texts, as our initial seed corpus. Using this corpus, we train a fastText model (Joulin et al., 2016) to recall more OpenWebMath-like mathematical web pages. Specifically, we randomly select 500,000 data points from the seed corpus as positive training examples and another 500,000 web pages from Common Crawl as negative ones. We employ an open-source library$^{1}$ for training, configuring the vector dimension to 256, the learning rate to 0.1, the maximum length of word n-grams to 3, the minimum number of word occurrences to 3, and the number of training epochs to 3. To reduce the size of the original Common Crawl, we employ URL-based deduplication and near-deduplication techniques, resulting in 40B HTML web pages. We then recall mathematical web pages from the deduplicated Common Crawl with the fastText model. To filter out low-quality mathematical content, we rank the collected pages according to their scores predicted by the fastText model, and only preserve the top-ranking ones. The volume of data preserved is assessed through pre-training experiments on the top 40B, 80B, 120B, and 160B tokens. In the first iteration, we choose to keep the top 40B tokens.

$^{1}$ https://fasttext.cc

![](_page_4_Figure_0.jpeg)

Figure 2 | An iterative pipeline that collects mathematical web pages from Common Crawl.

After the first iteration of data collection, numerous mathematical web pages remain uncollected, mainly because the fastText model is trained on a set of positive examples that lacks sufficient diversity. We therefore identify additional mathematical web sources to enrich the seed corpus, so that we can optimize the fastText model. Specifically, we first organize the entire Common Crawl into disjoint domains; a domain is defined as web pages sharing the same base URL. For each domain, we calculate the percentage of web pages that were collected in the first iteration. Domains where over 10% of the web pages have been collected are classified as math-related (e.g., mathoverflow.net). Subsequently, we manually annotate the URLs associated with mathematical content within these identified domains (e.g., mathoverflow.net/questions). Web pages linked to these URLs, yet still uncollected, are added to the seed corpus. This approach enables us to gather more positive examples, thereby training an improved fastText model capable of recalling more mathematical data in the subsequent iteration. After four iterations of data collection, we end up with 35.5M mathematical web pages, totaling 120B tokens.
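The domain-level classification step described above (flagging a domain as math-related when over 10% of its crawled pages were already recalled) can be sketched in pure Python. This is an illustrative toy, not the production pipeline: the function name and threshold default are ours, and the real system operates over the full deduplicated Common Crawl with fastText scores.

```python
from collections import defaultdict
from urllib.parse import urlparse

def math_related_domains(collected_urls, all_urls, threshold=0.10):
    """Return domains where more than `threshold` of the crawled pages
    were already recalled by the fastText classifier."""
    total = defaultdict(int)
    hit = defaultdict(int)
    collected = set(collected_urls)
    for url in all_urls:
        domain = urlparse(url).netloc  # pages sharing the same base URL
        total[domain] += 1
        hit[domain] += int(url in collected)
    return {d for d in total if hit[d] / total[d] > threshold}
```

Uncollected pages under the manually annotated math URL prefixes of these domains (e.g., mathoverflow.net/questions) are then added to the seed corpus for the next fastText iteration.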
In the fourth iteration, we notice that nearly 98% of the data has already been collected in the third iteration, so we decide to cease data collection.

To avoid benchmark contamination, we follow Guo et al. (2024) to filter out web pages containing questions or answers from English mathematical benchmarks such as GSM8K (Cobbe et al., 2021) and MATH (Hendrycks et al., 2021), and Chinese benchmarks such as CMATH (Wei et al., 2023) and AGIEval (Zhong et al., 2023). The filtering criteria are as follows: any text segment containing a 10-gram string that matches exactly with any sub-string from the evaluation benchmarks is removed from our math training corpus. For benchmark texts that are shorter than 10 grams but have at least 3 grams, we employ exact matching to filter out contaminated web pages.

## 2.2. Validating the Quality of the DeepSeekMath Corpus

We run pre-training experiments to investigate how the DeepSeekMath Corpus compares with recently released math-training corpora:

- **MathPile** (Wang et al., 2023c): a multi-source corpus (8.9B tokens) aggregated from textbooks, Wikipedia, ProofWiki, CommonCrawl, StackExchange, and arXiv, with the majority (over 85%) sourced from arXiv;
- **OpenWebMath** (Paster et al., 2023): CommonCrawl data filtered for mathematical content, totaling 13.6B tokens;
- **Proof-Pile-2** (Azerbayev et al., 2023): a mathematical corpus consisting of OpenWebMath, AlgebraicStack (10.3B tokens of mathematical code), and arXiv papers (28.0B tokens). When experimenting on Proof-Pile-2, we follow Azerbayev et al. (2023) to use an arXiv:Web:Code ratio of 2:4:1.

### 2.2.1. Training Setting

We apply math training to a general pre-trained language model with 1.3B parameters, which shares the same framework as the DeepSeek LLMs (DeepSeek-AI, 2024), denoted as DeepSeek-LLM 1.3B. We separately train a model on each mathematical corpus for 150B tokens.
All experiments are conducted using the efficient and lightweight HAI-LLM (High-flyer, 2023) training framework. Following the training practice of DeepSeek LLMs, we use the AdamW optimizer (Loshchilov and Hutter, 2017) with $\beta_1 = 0.9$, $\beta_2 = 0.95$, and weight\_decay = 0.1, along with a multi-step learning rate schedule where the learning rate reaches its peak after 2,000 warmup steps, decays to 31.6% of the peak after 80% of the training process, and further decays to 10.0% of the peak after 90% of the training process. We set the maximum learning rate to 5.3e-4 and use a batch size of 4M tokens with a 4K context length.

| Math Corpus | Size | GSM8K | MATH | OCW | SAT | MMLU-STEM | CMATH | Gaokao-MathCloze | Gaokao-MathQA |
|---------------------|--------|-------|-------|------|-------|-----------|-------|------------------|---------------|
| No Math Training | N/A | 2.9% | 3.0% | 2.9% | 15.6% | 19.5% | 12.3% | 0.8% | 17.9% |
| MathPile | 8.9B | 2.7% | 3.3% | 2.2% | 12.5% | 15.7% | 1.2% | 0.0% | 2.8% |
| OpenWebMath | 13.6B | 11.5% | 8.9% | 3.7% | 31.3% | 29.6% | 16.8% | 0.0% | 14.2% |
| Proof-Pile-2 | 51.9B | 14.3% | 11.2% | 3.7% | 43.8% | 29.2% | 19.9% | 5.1% | 11.7% |
| DeepSeekMath Corpus | 120.2B | 23.8% | 13.6% | 4.8% | 56.3% | 33.1% | 41.5% | 5.9% | 23.6% |

Table 1 | Performance of DeepSeek-LLM 1.3B trained on different mathematical corpora, evaluated using few-shot chain-of-thought prompting. GSM8K, MATH, OCW, SAT, and MMLU-STEM are English benchmarks; CMATH, Gaokao-MathCloze, and Gaokao-MathQA are Chinese benchmarks. Corpus sizes are calculated using our tokenizer with a vocabulary size of 100K.

### 2.2.2. Evaluation Results

The DeepSeekMath Corpus is of high quality, covers multilingual mathematical content, and is the largest in size.

![](_page_6_Figure_0.jpeg)

Figure 3 | Benchmark curves of DeepSeek-LLM 1.3B trained on different mathematical corpora.

- **High-quality**: We evaluate downstream performance on 8 mathematical benchmarks using few-shot chain-of-thought prompting (Wei et al., 2022). As shown in Table 1, the model trained on the DeepSeekMath Corpus holds a clear performance lead. Figure 3 shows that it also outperforms the model trained on Proof-Pile-2 at 50B tokens (1 full epoch of Proof-Pile-2), indicating that the average quality of the DeepSeekMath Corpus is higher.
- **Multilingual**: The DeepSeekMath Corpus encompasses data in multiple languages, with English and Chinese as the two most represented. As shown in Table 1, training on the DeepSeekMath Corpus enhances mathematical reasoning performance in both English and Chinese. In contrast, existing mathematical corpora, which are primarily English-centric, show limited improvement on, and may even hinder, Chinese mathematical reasoning.
- **Large-scale**: The DeepSeekMath Corpus is several times larger than existing mathematical corpora.
As depicted in Figure 3, DeepSeek-LLM 1.3B, when trained on the DeepSeekMath Corpus, shows a steeper learning curve along with more lasting improvements. In contrast, the baseline corpora are much smaller and have already been repeated for multiple rounds during training, with the resulting model performance quickly reaching a plateau.

## 2.3. Training and Evaluating DeepSeekMath-Base 7B

In this section, we introduce DeepSeekMath-Base 7B, a base model with strong reasoning abilities, especially in mathematics. Our model is initialized with DeepSeek-Coder-Base-v1.5 7B (Guo et al., 2024) and trained for 500B tokens. The data distribution is as follows: 56% from the DeepSeekMath Corpus, 4% from AlgebraicStack, 10% from arXiv, 20% from GitHub code, and the remaining 10% natural language data from Common Crawl in both English and Chinese. We mainly adopt the training setting specified in Section 2.2.1, except that we set the maximum learning rate to 4.2e-4 and use a batch size of 10M tokens.

We conduct a comprehensive assessment of the mathematical capabilities of DeepSeekMath-Base 7B, focusing on its ability to produce self-contained mathematical solutions without relying on external tools, to solve mathematical problems using tools, and to conduct formal theorem proving. Beyond mathematics, we also provide a more general profile of the base model, including its performance on natural language understanding, reasoning, and programming.

**Mathematical Problem Solving with Step-by-Step Reasoning** We evaluate DeepSeekMath-Base's performance in solving mathematical problems using few-shot chain-of-thought prompting (Wei et al., 2022), across eight benchmarks in English and Chinese.
These benchmarks encompass quantitative reasoning (e.g., GSM8K (Cobbe et al., 2021), MATH (Hendrycks et al., 2021), and CMATH (Wei et al., 2023)) and multiple-choice problems (e.g., MMLU-STEM (Hendrycks et al., 2020) and Gaokao-MathQA (Zhong et al., 2023)), covering diverse fields of mathematics from elementary to college-level complexity.

As shown in Table 2, DeepSeekMath-Base 7B leads in performance across all eight benchmarks among the open-source base models (including the widely-used general model Mistral 7B (Jiang et al., 2023) and the recently released Llemma 34B (Azerbayev et al., 2023), which underwent math training on Proof-Pile-2 (Azerbayev et al., 2023)). Notably, on the competition-level MATH dataset, DeepSeekMath-Base surpasses existing open-source base models by over 10% absolute, and outperforms Minerva 540B (Lewkowycz et al., 2022a), a closed-source base model 77 times larger that builds on PaLM (Lewkowycz et al., 2022b) and is further trained on mathematical texts.

| Model | Size | GSM8K | MATH | OCW | SAT | MMLU-STEM | CMATH | Gaokao-MathCloze | Gaokao-MathQA |
|--------------------------|------|-------|-------|-------|-------|-----------|-------|------------------|---------------|
| *Closed-Source Base Models* | | | | | | | | | |
| Minerva | 7B | 16.2% | 14.1% | 7.7% | - | 35.6% | - | - | - |
| Minerva | 62B | 52.4% | 27.6% | 12.0% | - | 53.9% | - | - | - |
| Minerva | 540B | 58.8% | 33.6% | 17.6% | - | 63.9% | - | - | - |
| *Open-Source Base Models* | | | | | | | | | |
| Mistral | 7B | 40.3% | 14.3% | 9.2% | 71.9% | 51.1% | 44.9% | 5.1% | 23.4% |
| Llemma | 7B | 37.4% | 18.1% | 6.3% | 59.4% | 43.1% | 43.4% | 11.9% | 23.6% |
| Llemma | 34B | 54.0% | 25.3% | 10.3% | 71.9% | 52.9% | 56.1% | 11.9% | 26.2% |
| DeepSeekMath-Base | 7B | 64.2% | 36.2% | 15.4% | 84.4% | 56.5% | 71.7% | 20.3% | 35.3% |

Table 2 | Comparisons between DeepSeekMath-Base 7B and strong base models on English and Chinese mathematical benchmarks. The first five benchmarks are English; the last three are Chinese. Models are evaluated with chain-of-thought prompting. Minerva results are quoted from Lewkowycz et al. (2022a).

**Mathematical Problem Solving with Tool Use** We evaluate program-aided mathematical reasoning on GSM8K and MATH using few-shot program-of-thought prompting (Chen et al., 2022; Gao et al., 2023). Models are prompted to solve each problem by writing a Python program in which libraries such as *math* and *sympy* can be utilized for intricate computations. The execution result of the program is taken as the answer. As shown in Table 3, DeepSeekMath-Base 7B outperforms the prior state-of-the-art Llemma 34B.
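Under program-of-thought prompting, the model writes a Python program whose execution result is taken as the answer. A minimal sketch of this answer-extraction step (ours, not the paper's evaluation harness; a real harness must sandbox the execution and enforce timeouts):

```python
import contextlib
import io

def run_program_of_thought(program: str) -> str:
    """Execute a model-generated Python program and return what it prints;
    the printed value is taken as the predicted answer."""
    buf = io.StringIO()
    with contextlib.redirect_stdout(buf):
        exec(program, {})  # fresh globals; the program may import math/sympy
    return buf.getvalue().strip()

# A toy "generated" solution: area of a circle of radius 3.
generated = "import math\nprint(math.pi * 3 ** 2)"
answer = run_program_of_thought(generated)
```

The returned string is then compared against the gold answer, typically with a numeric tolerance.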
| Model | Size | GSM8K+Python | MATH+Python | miniF2F-valid | miniF2F-test |
|-------------------|------|--------------|-------------|---------------|--------------|
| Mistral | 7B | 48.5% | 18.2% | 18.9% | 18.0% |
| CodeLlama | 7B | 27.1% | 17.2% | 16.3% | 17.6% |
| CodeLlama | 34B | 52.7% | 23.5% | 18.5% | 18.0% |
| Llemma | 7B | 41.0% | 18.6% | 20.6% | 22.1% |
| Llemma | 34B | 64.6% | 26.3% | 21.0% | 21.3% |
| DeepSeekMath-Base | 7B | 66.9% | 31.4% | 25.8% | 24.6% |

Table 3 | Few-shot evaluation of base models' ability to solve mathematical problems using tools (GSM8K+Python, MATH+Python) and to conduct informal-to-formal theorem proving in Isabelle (miniF2F-valid, miniF2F-test).

**Formal Mathematics** Formal proof automation, which has received increasing attention in recent years, helps ensure the accuracy and reliability of mathematical proofs and enhances efficiency. We evaluate DeepSeekMath-Base 7B on the task of informal-to-formal proving from Jiang et al. (2022), which is to generate a formal proof based on an informal statement, a formal counterpart of the statement, and an informal proof. We evaluate on miniF2F (Zheng et al., 2021), a benchmark for formal Olympiad-level mathematics, and generate a formal proof in Isabelle for each problem with few-shot prompting. Following Jiang et al. (2022), we leverage models to generate proof sketches, and execute the off-the-shelf automated prover Sledgehammer (Paulson, 2010) to fill in the missing details. As shown in Table 3, DeepSeekMath-Base 7B demonstrates strong performance in proof autoformalization.
| Model | Size | MMLU | BBH | HumanEval (Pass@1) | MBPP (Pass@1) |
|---------------------------|------|-------|-------|--------------------|---------------|
| Mistral | 7B | 62.4% | 55.7% | 28.0% | 41.4% |
| DeepSeek-Coder-Base-v1.5$^{\dagger}$ | 7B | 42.9% | 42.9% | 40.2% | 52.6% |
| DeepSeek-Coder-Base-v1.5 | 7B | 49.1% | 55.2% | 43.2% | 60.4% |
| DeepSeekMath-Base | 7B | 54.9% | 59.5% | 40.9% | 52.6% |

Table 4 | Evaluation on natural language understanding, reasoning, and code benchmarks. DeepSeek-Coder-Base-v1.5$^{\dagger}$ is the checkpoint right before learning rate decay, which is used to train DeepSeekMath-Base. On MMLU and BBH, we use few-shot chain-of-thought prompting. On HumanEval and MBPP, we evaluate model performance under the zero-shot and few-shot settings, respectively.

**Natural Language Understanding, Reasoning, and Code** We evaluate model performance on natural language understanding with MMLU (Hendrycks et al., 2020), reasoning with BBH (Suzgun et al., 2022), and coding capabilities with HumanEval (Chen et al., 2021) and MBPP (Austin et al., 2021). As shown in Table 4, DeepSeekMath-Base 7B exhibits significant enhancements on MMLU and BBH over its precursor, DeepSeek-Coder-Base-v1.5 (Guo et al., 2024), illustrating the positive impact of math training on language understanding and reasoning. Additionally, by including code tokens for continual training, DeepSeekMath-Base 7B effectively maintains the performance of DeepSeek-Coder-Base-v1.5 on the two coding benchmarks. Overall, DeepSeekMath-Base 7B significantly outperforms the general model Mistral 7B (Jiang et al., 2023) on the three reasoning and coding benchmarks.

# 3. Supervised Fine-Tuning

## 3.1. SFT Data Curation

We construct a mathematical instruction-tuning dataset covering English and Chinese problems from different mathematical fields and of varying complexity levels: problems are paired with solutions in chain-of-thought (CoT) (Wei et al., 2022), program-of-thought (PoT) (Chen et al., 2022; Gao et al., 2023), and tool-integrated reasoning format (Gou et al., 2023). The total number of training examples is 776K.

- **English mathematical datasets**: We annotate GSM8K and MATH problems with tool-integrated solutions, and adopt a subset of MathInstruct (Yue et al., 2023) along with the training set of Lila-OOD (Mishra et al., 2022), where problems are solved with CoT or PoT. Our English collection covers diverse fields of mathematics, e.g., algebra, probability, number theory, calculus, and geometry.
- **Chinese mathematical datasets**: We collect Chinese K-12 mathematical problems spanning 76 sub-topics such as linear equations, with solutions annotated in both CoT and tool-integrated reasoning format.

## 3.2. Training and Evaluating DeepSeekMath-Instruct 7B

In this section, we introduce DeepSeekMath-Instruct 7B, which undergoes mathematical instruction tuning based on DeepSeekMath-Base. Training examples are randomly concatenated until reaching a maximum context length of 4K tokens. We train the model for 500 steps with a batch size of 256 and a constant learning rate of 5e-5.

We evaluate models' mathematical performance both without and with tool use, on four quantitative reasoning benchmarks in English and Chinese.
We benchmark our model against the leading models of the time:

- **Closed-source models** include: (1) the GPT family, among which GPT-4 (OpenAI, 2023) and GPT-4 Code Interpreter$^2$ are the most capable, (2) Gemini Ultra and Pro (Anil et al., 2023), (3) Inflection-2 (Inflection AI, 2023), (4) Grok-1$^3$, as well as models recently released by Chinese companies, including (5) Baichuan-3$^4$ and (6) the latest GLM-4$^5$ from the GLM family (Du et al., 2022). These models are for general purposes, and most have undergone a series of alignment procedures.
- **Open-source models** include: general models like (1) DeepSeek-LLM-Chat 67B (DeepSeek-AI, 2024), (2) Qwen 72B (Bai et al., 2023), (3) SeaLLM-v2 7B (Nguyen et al., 2023), and (4) ChatGLM3 6B (ChatGLM3 Team, 2023), as well as models with enhancements in mathematics, including (5) InternLM2-Math 20B$^6$, which builds on InternLM2 and underwent math training followed by instruction tuning, (6) Math-Shepherd-Mistral 7B, which applies PPO training (Schulman et al., 2017) to Mistral 7B (Jiang et al., 2023) with a process-supervised reward model, (7) the WizardMath series (Luo et al., 2023), which improves mathematical reasoning in Mistral 7B and Llama-2 70B (Touvron et al., 2023) using evol-instruct (i.e., a version of instruction tuning that uses AI-evolved instructions) and PPO training, with training problems primarily sourced from GSM8K and MATH, (8) MetaMath 70B (Yu et al., 2023), which is Llama-2 70B fine-tuned on an augmented version of GSM8K and MATH, (9) ToRA 34B (Gou et al., 2023), which is CodeLlama 34B fine-tuned to do tool-integrated mathematical reasoning, and (10) MAmmoTH 70B (Yue et al., 2023), which is Llama-2 70B instruction-tuned on MathInstruct.

$^2$ https://openai.com/blog/chatgpt-plugins#code-interpreter

$^3$ https://x.ai/model-card

$^4$ https://www.baichuan-ai.com

$^5$ https://open.bigmodel.cn/dev/api#glm-4
As shown in Table 5, under the evaluation setting where tool use is disallowed, DeepSeekMath-Instruct 7B demonstrates strong performance in step-by-step reasoning. Notably, on the competition-level MATH dataset, our model surpasses all open-source models and the majority of proprietary models (e.g., Inflection-2 and Gemini Pro) by at least 9% absolute. This holds even against models that are substantially larger (e.g., Qwen 72B) or have been specifically enhanced through math-focused reinforcement learning (e.g., WizardMath-v1.1 7B). While DeepSeekMath-Instruct rivals the Chinese proprietary models GLM-4 and Baichuan-3 on MATH, it still underperforms GPT-4 and Gemini Ultra.

Under the evaluation setting where models are allowed to integrate natural language reasoning and program-based tool use for problem solving, DeepSeekMath-Instruct 7B approaches an accuracy of 60% on MATH, surpassing all existing open-source models. On the other benchmarks, our model is competitive with DeepSeek-LLM-Chat 67B, the prior state of the art, which is 10 times larger.

# 4. Reinforcement Learning

## 4.1. Group Relative Policy Optimization

Reinforcement learning (RL) has proven effective in further improving the mathematical reasoning ability of LLMs after the supervised fine-tuning (SFT) stage (Luo et al., 2023; Wang et al., 2023b). In this section, we introduce our efficient and effective RL algorithm, Group Relative Policy Optimization (GRPO).

### 4.1.1. From PPO to GRPO

Proximal Policy Optimization (PPO) (Schulman et al., 2017) is an actor-critic RL algorithm that is widely used in the RL fine-tuning stage of LLMs (Ouyang et al., 2022).
In particular, it optimizes LLMs by maximizing the following surrogate objective:

$$\mathcal{J}_{\text{PPO}}(\theta) = \mathbb{E}\left[q \sim P(Q),\ o \sim \pi_{\theta_{\text{old}}}(O|q)\right] \frac{1}{|o|} \sum_{t=1}^{|o|} \min\left[ \frac{\pi_{\theta}(o_t|q, o_{<t})}{\pi_{\theta_{\text{old}}}(o_t|q, o_{<t})} A_t,\ \text{clip}\left( \frac{\pi_{\theta}(o_t|q, o_{<t})}{\pi_{\theta_{\text{old}}}(o_t|q, o_{<t})}, 1-\varepsilon, 1+\varepsilon \right) A_t \right],$$
(1)

where $\pi_{\theta}$ and $\pi_{\theta_{\text{old}}}$ are the current and old policy models, and $q$, $o$ are questions and outputs sampled from the question dataset and the old policy $\pi_{\theta_{\text{old}}}$, respectively. $\varepsilon$ is a clipping-related hyper-parameter introduced in PPO for stabilizing training. $A_t$ is the advantage, which is computed by applying Generalized Advantage Estimation (GAE) (Schulman et al., 2015), based on the rewards $\{r_{\geq t}\}$ and a learned value function $V_{\psi}$. Thus, in PPO, a value function needs to be trained alongside the policy model, and to mitigate over-optimization of the reward model, the standard approach is to add a per-token KL penalty from a reference model to the reward at each token (Ouyang et al., 2022), i.e.,

$$r_{t} = r_{\varphi}(q, o_{\leq t}) - \beta \log \frac{\pi_{\theta}(o_{t}|q, o_{< t})}{\pi_{ref}(o_{t}|q, o_{< t})},$$
(2)

where $r_{\varphi}$ is the reward model, $\pi_{ref}$ is the reference model, which is usually the initial SFT model, and $\beta$ is the coefficient of the KL penalty.

$^6$ https://github.com/InternLM/InternLM-Math

| Model | Size | GSM8K | MATH | MGSM-zh | CMATH |
|------------------------------|------|-------|-------|---------|-------|
| *Chain-of-Thought Reasoning* | | | | | |
| *Closed-Source Models* | | | | | |
| Gemini Ultra | - | 94.4% | 53.2% | - | - |
| GPT-4 | - | 92.0% | 52.9% | - | 86.0% |
| Inflection-2 | - | 81.4% | 34.8% | - | - |
| GPT-3.5 | - | 80.8% | 34.1% | - | 73.8% |
| Gemini Pro | - | 86.5% | 32.6% | - | - |
| Grok-1 | - | 62.9% | 23.9% | - | - |
| Baichuan-3 | - | 88.2% | 49.2% | - | - |
| GLM-4 | - | 87.6% | 47.9% | - | - |
| *Open-Source Models* | | | | | |
| InternLM2-Math | 20B | 82.6% | 37.7% | - | - |
| Qwen | 72B | 78.9% | 35.2% | - | - |
| Math-Shepherd-Mistral | 7B | 84.1% | 33.0% | - | - |
| WizardMath-v1.1 | 7B | 83.2% | 33.0% | - | - |
| DeepSeek-LLM-Chat | 67B | 84.1% | 32.6% | 74.0% | 80.3% |
| MetaMath | 70B | 82.3% | 26.6% | 66.4% | 70.9% |
| SeaLLM-v2 | 7B | 78.2% | 27.5% | 64.8% | - |
| ChatGLM3 | 6B | 72.3% | 25.7% | - | - |
| WizardMath-v1.0 | 70B | 81.6% | 22.7% | 64.8% | 65.4% |
| DeepSeekMath-Instruct | 7B | 82.9% | 46.8% | 73.2% | 84.6% |
| DeepSeekMath-RL | 7B | 88.2% | 51.7% | 79.6% | 88.8% |
| *Tool-Integrated Reasoning* | | | | | |
| *Closed-Source Models* | | | | | |
| GPT-4 Code Interpreter | - | 97.0% | 69.7% | - | - |
| *Open-Source Models* | | | | | |
| InternLM2-Math | 20B | 80.7% | 54.3% | - | - |
| DeepSeek-LLM-Chat | 67B | 86.7% | 51.1% | 76.4% | 85.4% |
| ToRA | 34B | 80.7% | 50.8% | 41.2% | 53.4% |
| MAmmoTH | 70B | 76.9% | 41.8% | - | - |
| DeepSeekMath-Instruct | 7B | 83.7% | 57.4% | 72.0% | 84.3% |
| DeepSeekMath-RL | 7B | 86.7% | 58.8% | 78.4% | 87.6% |

Table 5 | Performance of open- and closed-source models with both chain-of-thought and tool-integrated reasoning on English (GSM8K, MATH) and Chinese (MGSM-zh, CMATH) benchmarks. Scores in gray denote majority votes with 32 candidates; the others are Top1 scores. DeepSeekMath-RL 7B beats all open-source models from 7B to 70B, as well as the majority of closed-source models. Although DeepSeekMath-RL 7B is only further trained on chain-of-thought-format instruction tuning data of GSM8K and MATH, it improves over DeepSeekMath-Instruct 7B on all benchmarks.

![](_page_12_Figure_0.jpeg)

Figure 4 | Demonstration of PPO and our GRPO. GRPO foregoes the value model, instead estimating the baseline from group scores, significantly reducing training resources.
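In implementations, the reward model typically scores only the complete response, so its score is assigned to the final token while every token pays the KL penalty of Eq. (2). A minimal sketch (function name and inputs ours; the default $\beta = 0.04$ matches Section 4.2):

```python
def shaped_rewards(logp_policy, logp_ref, reward_score, beta=0.04):
    """Per-token reward of Eq. (2): each token pays the KL penalty
    beta * (log pi_theta - log pi_ref); the reward-model score for the
    whole response is added at the final token."""
    rewards = []
    last = len(logp_policy) - 1
    for t, (lp, lr) in enumerate(zip(logp_policy, logp_ref)):
        r = -beta * (lp - lr)  # KL penalty term
        if t == last:
            r += reward_score  # scalar score for the full output
        rewards.append(r)
    return rewards
```

The inputs are per-token log-probabilities of the sampled output under the policy and reference models.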
As the value function employed in PPO is typically another model of comparable size to the policy model, it brings a substantial memory and computational burden. Additionally, during RL training, the value function is treated as a baseline in the calculation of the advantage for variance reduction. In the LLM context, however, usually only the last token is assigned a reward score by the reward model, which may complicate the training of a value function that is accurate at each token. To address this, as shown in Figure 4, we propose Group Relative Policy Optimization (GRPO), which obviates the need for the additional value function approximation used in PPO, and instead uses the average reward of multiple sampled outputs, produced in response to the same question, as the baseline. More specifically, for each question $q$, GRPO samples a group of outputs $\{o_1, o_2, \cdots, o_G\}$ from the old policy $\pi_{\theta_{\text{old}}}$ and then optimizes the policy model by maximizing the following objective:

$$\mathcal{J}_{\text{GRPO}}(\theta) = \mathbb{E}\left[q \sim P(Q), \{o_i\}_{i=1}^G \sim \pi_{\theta_{\text{old}}}(O|q)\right] \frac{1}{G} \sum_{i=1}^{G} \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \left\{ \min\left[ \frac{\pi_{\theta}(o_{i,t}|q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t}|q, o_{i,<t})} \hat{A}_{i,t},\ \text{clip}\left( \frac{\pi_{\theta}(o_{i,t}|q, o_{i,<t})}{\pi_{\theta_{\text{old}}}(o_{i,t}|q, o_{i,<t})}, 1-\varepsilon, 1+\varepsilon \right) \hat{A}_{i,t} \right] - \beta\, \mathbb{D}_{KL}\left[ \pi_{\theta} || \pi_{ref} \right] \right\},$$
(3)

where $\varepsilon$ and $\beta$ are hyper-parameters, and $\hat{A}_{i,t}$ is the advantage calculated based on the relative rewards of the outputs inside each group only, which will be detailed in the following subsections. The group-relative way in which GRPO calculates the advantages aligns well with the comparative nature of reward models, as reward models are typically trained on datasets of comparisons between outputs to the same question. Also note that, instead of adding the KL penalty to the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss, avoiding complicating the calculation of $\hat{A}_{i,t}$.
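A minimal sketch of the two per-token ingredients of objective (3): the group-relative (outcome-supervision) advantage, and the clipped surrogate term. Function names, the $\varepsilon$ default, and the use of the population standard deviation are illustrative choices of ours; the KL regularizer of (3) is omitted here:

```python
import math
import statistics

def group_relative_advantages(rewards):
    """Normalize each output's scalar reward by the group mean and std;
    every token of output i then shares the same advantage."""
    mean = statistics.mean(rewards)
    std = statistics.pstdev(rewards)  # population std; conventions vary
    return [(r - mean) / (std + 1e-8) for r in rewards]

def grpo_token_term(logp_new, logp_old, advantage, eps=0.2):
    """Clipped surrogate for one token of one output; the probability
    ratio is exp(log pi_theta - log pi_theta_old)."""
    ratio = math.exp(logp_new - logp_old)
    clipped = max(min(ratio, 1.0 + eps), 1.0 - eps)
    return min(ratio * advantage, clipped * advantage)
```

Averaging `grpo_token_term` over tokens and outputs, and subtracting $\beta$ times the KL estimate, recovers the quantity maximized in (3).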
**Algorithm 1** Iterative Group Relative Policy Optimization

```
Input: initial policy model \pi_{\theta_{init}}; reward model r_{\varphi}; task prompts \mathcal{D}; hyperparameters \varepsilon, \beta, \mu
 1: policy model \pi_{\theta} <- \pi_{\theta_{init}}
 2: for iteration = 1, ..., I do
 3:   reference model \pi_{ref} <- \pi_{\theta}
 4:   for step = 1, ..., M do
 5:     Sample a batch \mathcal{D}_b from \mathcal{D}
 6:     Update the old policy model \pi_{\theta_{old}} <- \pi_{\theta}
 7:     Sample G outputs {o_i}_{i=1}^G ~ \pi_{\theta_{old}}(. | q) for each question q in \mathcal{D}_b
 8:     Compute rewards {r_i}_{i=1}^G for each sampled output o_i by running r_{\varphi}
 9:     Compute \hat{A}_{i,t} for the t-th token of o_i through group relative advantage estimation
10:     for GRPO iteration = 1, ..., \mu do
11:       Update the policy model \pi_{\theta} by maximizing the GRPO objective (Equation 3)
12:   Update r_{\varphi} through continual training using a replay mechanism
Output: \pi_{\theta}
```

Different from the KL penalty term used in (2), we estimate the KL divergence with the following unbiased estimator (Schulman, 2020):

$$\mathbb{D}_{KL}\left[\pi_{\theta}||\pi_{\text{ref}}\right] = \frac{\pi_{\text{ref}}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})} - \log \frac{\pi_{\text{ref}}(o_{i,t}|q,o_{i,<t})}{\pi_{\theta}(o_{i,t}|q,o_{i,<t})} - 1,$$
(4)

which is guaranteed to be positive.

### 4.1.2. Outcome Supervision RL with GRPO

Formally, for each question $q$, a group of outputs $\{o_1, o_2, \cdots, o_G\}$ are sampled from the old policy model $\pi_{\theta_{\text{old}}}$. A reward model is then used to score the outputs, yielding $G$ rewards $\mathbf{r} = \{r_1, r_2, \cdots, r_G\}$ correspondingly. Subsequently, these rewards are normalized by subtracting the group average and dividing by the group standard deviation.
Outcome supervision provides the normalized reward at the end of each output $o_i$ and sets the advantages $\hat{A}_{i,t}$ of all tokens in the output to the normalized reward, i.e., $\hat{A}_{i,t} = \tilde{r}_i = \frac{r_i - \text{mean}(\mathbf{r})}{\text{std}(\mathbf{r})}$, and then optimizes the policy by maximizing the objective defined in equation (3).

### 4.1.3. Process Supervision RL with GRPO

Outcome supervision only provides a reward at the end of each output, which may not be sufficient or efficient for supervising the policy in complex mathematical tasks. Following Wang et al. (2023b), we also explore process supervision, which provides a reward at the end of each reasoning step. Formally, given the question $q$ and $G$ sampled outputs $\{o_1, o_2, \cdots, o_G\}$, a process reward model is used to score each step of the outputs, yielding corresponding rewards: $\mathbf{R} = \{\{r_1^{index(1)}, \cdots, r_1^{index(K_1)}\}, \cdots, \{r_G^{index(1)}, \cdots, r_G^{index(K_G)}\}\}$, where $index(j)$ is the end token index of the $j$-th step, and $K_i$ is the total number of steps in the $i$-th output. We also normalize these rewards with the average and the standard deviation, i.e., $\tilde{r}_i^{index(j)} = \frac{r_i^{index(j)} - \text{mean}(\mathbf{R})}{\text{std}(\mathbf{R})}$. Subsequently, the process supervision calculates the advantage of each token as the sum of the normalized rewards from the following steps, i.e., $\hat{A}_{i,t} = \sum_{index(j) \geq t} \tilde{r}_i^{index(j)}$, and then optimizes the policy by maximizing the objective defined in equation (3).

### 4.1.4. Iterative RL with GRPO

As the reinforcement learning training process progresses, the old reward model may not be sufficient to supervise the current policy model. Therefore, we also explore iterative RL with GRPO.
As shown in Algorithm 1, in iterative GRPO, we generate new training sets for the reward model based on the sampling results from the policy model and continually train the old reward model using a replay mechanism that incorporates 10% of historical data. Then, we set the reference model as the policy model and continually train the policy model with the new reward model.

## 4.2. Training and Evaluating DeepSeekMath-RL

We conduct RL based on DeepSeekMath-Instruct 7B. The training data of RL are chain-of-thought-format questions related to GSM8K and MATH from the SFT data, consisting of around 144K questions. We exclude other SFT questions to investigate the impact of RL on benchmarks that lack data throughout the RL phase. We construct the training set of reward models following Wang et al. (2023b). We train our initial reward model based on DeepSeekMath-Base 7B with a learning rate of 2e-5. For GRPO, we set the learning rate of the policy model to 1e-6. The KL coefficient is 0.04. For each question, we sample 64 outputs. The max length is set to 1024, and the training batch size is 1024. The policy model has only a single update following each exploration stage. We evaluate DeepSeekMath-RL 7B on the benchmarks used for DeepSeekMath-Instruct 7B. For DeepSeekMath-RL 7B, GSM8K and MATH with chain-of-thought reasoning can be regarded as in-domain tasks, and all the other benchmarks can be regarded as out-of-domain tasks.

Table 5 demonstrates the performance of open- and closed-source models with both chain-of-thought and tool-integrated reasoning on English and Chinese benchmarks. We find that: 1) DeepSeekMath-RL 7B attains accuracies of 88.2% and 51.7% on GSM8K and MATH, respectively, utilizing chain-of-thought reasoning. This performance surpasses that of all open-source models in the 7B to 70B range, as well as the majority of closed-source models.
2) Crucially, DeepSeekMath-RL 7B is only trained on chain-of-thought-format instruction tuning data of GSM8K and MATH, starting from DeepSeekMath-Instruct 7B. Despite the constrained scope of its training data, it outperforms DeepSeekMath-Instruct 7B across all evaluation metrics, showcasing the effectiveness of reinforcement learning.

# 5. Discussion

In this section, we share our findings from pre-training and RL experiments.

## 5.1. Lessons Learnt in Pre-Training

We first share our experience in pre-training. Unless otherwise specified, we adhere to the training settings outlined in Section 2.2.1. It is worth noting that, when referring to the DeepSeekMath Corpus in this section, we use an 89B-token dataset from the second iteration of the data collection process.

### 5.1.1. Code Training Benefits Mathematical Reasoning

A popular yet unverified hypothesis suggests that code training improves reasoning. We attempt to offer a partial response to this, particularly within the mathematical domain: code training improves models' ability to do mathematical reasoning both with and without tool use.

| Training Setting | General | Code | Math | GSM8K | MATH | CMATH | GSM8K+Python | MATH+Python |
|----------------------------|---------|------|------|-------|-------|-------|--------------|-------------|
| No Continual Training | – | – | – | 2.9% | 3.0% | 12.3% | 2.7% | 2.3% |
| *Two-Stage Training* | | | | | | | | |
| Stage 1: General Training | 400B | – | – | 2.9% | 3.2% | 14.8% | 3.3% | 2.3% |
| Stage 2: Math Training | – | – | 150B | 19.1% | 14.4% | 37.2% | 14.3% | 6.7% |
| Stage 1: Code Training | – | 400B | – | 5.9% | 3.6% | 19.9% | 12.4% | 10.0% |
| Stage 2: Math Training | – | – | 150B | 21.9% | 15.3% | 39.7% | 17.4% | 9.4% |
| *One-Stage Training* | | | | | | | | |
| Math Training | – | – | 150B | 20.5% | 13.1% | 37.6% | 11.4% | 6.5% |
| Code & Math Mixed Training | – | 400B | 150B | 17.6% | 12.1% | 36.3% | 19.7% | 13.5% |

Table 6 | Investigation of how code affects mathematical reasoning under different training settings. The General, Code, and Math columns give continual-training token counts. We experiment with DeepSeek-LLM 1.3B, and evaluate its mathematical reasoning performance without tool use (GSM8K, MATH, CMATH) and with tool use (GSM8K+Python, MATH+Python) via few-shot chain-of-thought prompting and few-shot program-of-thought prompting, respectively.

To study how code training affects mathematical reasoning, we experiment with the following two-stage and one-stage training settings:

**Two-Stage Training**

- Code Training for 400B Tokens → Math Training for 150B Tokens: We train DeepSeek-LLM 1.3B for 400B code tokens followed by 150B math tokens;
- General Training for 400B Tokens → Math Training for 150B Tokens: As a control experiment, we also experiment with general tokens (sampled from a large-scale general corpus created by DeepSeek-AI) instead of code tokens in the first stage of training, in an attempt to investigate the advantages of code tokens over general tokens in improving mathematical reasoning.

**One-Stage Training**

- Math Training for 150B Tokens: We train DeepSeek-LLM 1.3B for 150B math tokens;
- Training on a mixture of 400B Code Tokens and 150B Math Tokens: Math training following code training degrades coding performance. We investigate whether code tokens, when mixed with math tokens for one-stage training, would still improve mathematical reasoning and also alleviate the problem of catastrophic forgetting.

**Results** Table 6 and Table 7 demonstrate the downstream performance under different training settings.

Code training benefits program-aided mathematical reasoning, under both the two-stage and one-stage training settings. As shown in Table 6, under the two-stage training setting, code training alone already significantly enhances the ability to solve GSM8K and MATH problems using Python.
Math training in the second stage yields further improvements. Interestingly, under the one-stage training setting, mixing code tokens and math tokens effectively mitigates the issue of catastrophic forgetting that arises from two-stage training, and also synergizes coding (Table 7) and program-aided mathematical reasoning (Table 6).

| Training Setting | Training Tokens | | | MMLU | BBH | HumanEval (Pass@1) | MBPP (Pass@1) |
|----------------------------|---------|------|------|--------------|--------------|--------------------|---------------|
| | General | Code | Math | | | | |
| No Continual Training | – | – | – | 24.5% | 28.1% | 12.2% | 13.0% |
| Two-Stage Training | | | | | | | |
| Stage 1: General Training | 400B | – | – | 25.9% | 27.7% | 15.2% | 13.6% |
| Stage 2: Math Training | – | – | 150B | 33.1% | 32.7% | 12.8% | 13.2% |
| Stage 1: Code Training | – | 400B | – | 25.0% | 31.5% | 25.0% | 40.0% |
| Stage 2: Math Training | – | – | 150B | 36.2% | 35.3% | 12.2% | 17.0% |
| One-Stage Training | | | | | | | |
| Math Training | – | – | 150B | 32.3% | 32.5% | 11.6% | 13.2% |
| Code & Math Mixed Training | – | 400B | 150B | 33.5% | 35.6% | 29.3% | 39.4% |

Table 7 | Investigation of how different settings of code and math training affect model performance of language understanding, reasoning, and coding. We experiment with DeepSeek-LLM 1.3B. We evaluate the models on MMLU and BBH using few-shot chain-of-thought prompting. On HumanEval and MBPP, we conduct zero-shot and few-shot evaluations, respectively.

| Model | Size | ArXiv Corpus | GSM8K | MATH | OCW | SAT | MMLU-STEM | CMATH | Gaokao-MathCloze | Gaokao-MathQA |
|--------------------------|------|------------------|-------|-------|------|-------|-----------|-------|------------------|---------------|
| DeepSeek-LLM | 1.3B | No Math Training | 2.9% | 3.0% | 2.9% | 15.6% | 19.5% | 12.3% | 0.8% | 17.9% |
| | | MathPile | 2.7% | 3.3% | 2.2% | 12.5% | 15.7% | 1.2% | 0.0% | 2.8% |
| | | ArXiv-RedPajama | 3.3% | 3.4% | 4.0% | 9.4% | 9.0% | 7.4% | 0.8% | 2.3% |
| DeepSeek-Coder-Base-v1.5 | 7B | No Math Training | 29.0% | 12.5% | 6.6% | 40.6% | 38.1% | 45.9% | 5.9% | 21.1% |
| | | MathPile | 23.6% | 11.5% | 7.0% | 46.9% | 35.8% | 37.9% | 4.2% | 25.6% |
| | | ArXiv-RedPajama | 28.1% | 11.1% | 7.7% | 50.0% | 35.2% | 42.6% | 7.6% | 24.8% |

Table 8 | Effect of math training on different arXiv datasets. Model performance is evaluated with few-shot chain-of-thought prompting. GSM8K, MATH, OCW, SAT, and MMLU-STEM are English benchmarks; CMATH, Gaokao-MathCloze, and Gaokao-MathQA are Chinese benchmarks.

| ArXiv Corpus | miniF2F-valid | miniF2F-test |
|------------------|---------------|--------------|
| No Math Training | 20.1% | 21.7% |
| MathPile | 16.8% | 16.4% |
| ArXiv-RedPajama | 14.8% | 11.9% |

Table 9 | Effect of math training on different arXiv corpora, the base model being DeepSeek-Coder-Base-v1.5 7B. We evaluate informal-to-formal proving in Isabelle.

Code training also improves mathematical reasoning without tool use. Under the two-stage training setting, the initial stage of code training already results in moderate enhancements. It also boosts the efficiency of the subsequent math training, eventually leading to the best performance. However, combining code tokens and math tokens for one-stage training compromises mathematical reasoning without tool use. One conjecture is that DeepSeek-LLM 1.3B, due to its limited scale, lacks the capacity to fully assimilate both code and mathematical data simultaneously.

### 5.1.2. ArXiv Papers Seem Ineffective in Improving Mathematical Reasoning

ArXiv papers are commonly included as a component of math pre-training data (Azerbayev et al., 2023; Lewkowycz et al., 2022a; Polu and Sutskever, 2020; Wang et al., 2023c). However, detailed analysis regarding their impact on mathematical reasoning has not been extensively conducted. Perhaps counter-intuitively, according to our experiments, arXiv papers seem ineffective in improving mathematical reasoning.
We experiment with models of different sizes, including DeepSeek-LLM 1.3B and DeepSeek-Coder-Base-v1.5 7B (Guo et al., 2024), using arXiv corpora that underwent varied processing pipelines:

- **MathPile** (Wang et al., 2023c): an 8.9B-token corpus developed with heuristic cleaning and filtering rules, over 85% of which is scientific arXiv papers;
- **ArXiv-RedPajama** (Computer, 2023): the entirety of arXiv LaTeX files with preambles, comments, macros, and bibliographies removed, totaling 28.0B tokens.

In our experiments, we separately train DeepSeek-LLM 1.3B for 150B tokens and DeepSeek-Coder-Base-v1.5 7B for 40B tokens on each arXiv corpus. It seems that arXiv papers are ineffective in improving mathematical reasoning. When trained on an arXiv-only corpus, both models display no notable improvements or even deterioration across the various mathematical benchmarks of different complexities employed in this study. These benchmarks include quantitative reasoning datasets like GSM8K and MATH (Table 8), multiple-choice challenges like MMLU-STEM (Table 8), and formal mathematics like miniF2F (Table 9).

However, this conclusion has its limitations and should be taken with a grain of salt. We have not yet studied:

- The impact of arXiv tokens on specific math-related tasks not included in this research, such as informalization of theorems, i.e., converting formal statements or proofs into their informal versions;
- The effect of arXiv tokens when combined with other types of data;
- Whether the benefits of arXiv papers would manifest themselves at a larger model scale.

Thus, further exploration is required, which we leave for future studies.

## 5.2. Insights of Reinforcement Learning

### 5.2.1. Towards a Unified Paradigm

In this section, we provide a unified paradigm to analyze different training methods, such as SFT, RFT, DPO, PPO, and GRPO, and further conduct experiments to explore the factors of this unified paradigm.
Generally, the gradient with respect to the parameter $\theta$ of a training method can be written as:

$$\nabla_{\theta} \mathcal{J}_{\mathcal{A}}(\theta) = \mathbb{E}_{\underbrace{(q, o) \sim \mathcal{D}}_{\text{Data Source}}}\left[\frac{1}{|o|} \sum_{t=1}^{|o|} \underbrace{GC_{\mathcal{A}}(q, o, t, \pi_{rf})}_{\text{Gradient Coefficient}} \nabla_{\theta} \log \pi_{\theta}(o_{t} \mid q, o_{< t})\right]. \tag{5}$$

There exist three key components: 1) *Data Source* $\mathcal{D}$, which determines the training data; 2) *Reward Function* $\pi_{rf}$, which is the source of the training reward signal; 3) *Algorithm* $\mathcal{A}$, which processes the training data and the reward signal into the gradient coefficient $GC$ that determines the magnitude of the penalty or reinforcement for the data. We analyze several representative methods based on this unified paradigm:

- **Supervised Fine-tuning (SFT)**: SFT fine-tunes a pretrained model on human-selected SFT data.

| Methods | Data Source | Reward Function | Gradient Coefficient |
|------------|---------------------------------------------------------------------|-----------------|----------------------|
| SFT | $q, o \sim P_{sft}(Q, O)$ | – | 1 |
| RFT | $q \sim P_{sft}(Q), o \sim \pi_{sft}(O \mid q)$ | Rule | Equation 10 |
| DPO | $q \sim P_{sft}(Q), o^+, o^- \sim \pi_{sft}(O \mid q)$ | Rule | Equation 14 |
| Online RFT | $q \sim P_{sft}(Q), o \sim \pi_{\theta}(O \mid q)$ | Rule | Equation 10 |
| PPO | $q \sim P_{sft}(Q), o \sim \pi_{\theta}(O \mid q)$ | Model | Equation 18 |
| GRPO | $q \sim P_{sft}(Q), \{o_i\}_{i=1}^G \sim \pi_{\theta}(O \mid q)$ | Model | Equation 21 |

Table 10 | The data source and gradient coefficient of different methods. $P_{sft}$ denotes the data distribution of supervised fine-tuning datasets. $\pi_{sft}$ and $\pi_{\theta}$ denote the supervised fine-tuned model and the real-time policy model during the online training process, respectively.
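To make the gradient-coefficient column of Table 10 concrete, the following is a minimal Python sketch, for illustration only (the function names are ours, not the paper's), of three of the coefficients: SFT assigns every token a coefficient of 1, (Online) RFT keeps only responses with correct answers, and GRPO with outcome supervision assigns every token the group-normalized reward, so below-average responses receive a negative coefficient rather than being dropped.

```python
import math

def sft_gc(num_tokens: int) -> list[float]:
    # SFT: every token of a demonstration is reinforced with GC = 1.
    return [1.0] * num_tokens

def rft_gc(num_tokens: int, answer_is_correct: bool) -> list[float]:
    # (Online) RFT: responses with correct answers get GC = 1; incorrect
    # responses are filtered out, i.e. contribute no gradient at all.
    return [1.0 if answer_is_correct else 0.0] * num_tokens

def grpo_outcome_gc(group_rewards: list[float], i: int, num_tokens: int) -> list[float]:
    # GRPO (outcome supervision): every token of response i shares the
    # group-normalized reward (r_i - mean) / std, so an incorrect response
    # is actively penalized with a negative coefficient.
    mean = sum(group_rewards) / len(group_rewards)
    std = math.sqrt(sum((r - mean) ** 2 for r in group_rewards) / len(group_rewards))
    return [(group_rewards[i] - mean) / (std + 1e-8)] * num_tokens

# Toy group of G = 4 sampled responses to one question, rule-based 0/1 rewards:
rewards = [1.0, 0.0, 0.0, 1.0]
print(grpo_outcome_gc(rewards, 0, 3))  # positive coefficients (correct response)
print(grpo_outcome_gc(rewards, 1, 3))  # negative coefficients (incorrect response)
```

Note how only the GRPO coefficient can go negative: incorrect responses are penalized instead of merely discarded, which is the behavioral difference from Online RFT examined below.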
![](_page_18_Figure_2.jpeg)

Figure 5 | Performance of the DeepSeekMath-Instruct 1.3B model, which was further trained using various methods, on two benchmarks.

- **Rejection Sampling Fine-tuning (RFT)**: RFT further fine-tunes the SFT model on the filtered outputs sampled from the SFT model based on SFT questions. RFT filters the outputs based on the correctness of their answers.
- **Direct Preference Optimization (DPO)**: DPO further refines the SFT model by fine-tuning it on augmented outputs sampled from the SFT model, using a pair-wise DPO loss.
- **Online Rejection Sampling Fine-tuning (Online RFT)**: Different from RFT, Online RFT initializes the policy model with the SFT model and refines it by fine-tuning on the augmented outputs sampled from the real-time policy model.
- **PPO/GRPO**: PPO/GRPO initializes the policy model with the SFT model and reinforces it with the outputs sampled from the real-time policy model.

We summarize the components of these methods in Table 10. Please refer to Appendix A.1 for a more detailed derivation process.

**Observation about Data Source** We divide the data source into two categories: online sampling and offline sampling. Online sampling denotes that the training data comes from the exploration results of the real-time training policy model, while offline sampling denotes that the training data comes from the sampling results of the initial SFT model. RFT and DPO follow the offline style, while Online RFT and GRPO follow the online style.

![](_page_19_Figure_0.jpeg)

Figure 6 | Performance of iterative reinforcement learning with DeepSeekMath-Instruct 7B on two benchmarks.

As shown in Figure 5, we find that Online RFT significantly outperforms RFT on the two benchmarks. Specifically, Online RFT is comparable to RFT in the early stage of training but gains an absolute advantage in the later stage, demonstrating the superiority of online training.
This is intuitive: in the initial stage, the actor and the SFT model closely resemble each other, so the sampled data reveal only minor differences. In the later stage, however, the data sampled from the actor exhibit more significant differences, and real-time data sampling offers greater advantages.

**Observation about Gradient Coefficient** The algorithm processes the reward signal into the gradient coefficient to update the model parameters. We divide the reward function into 'Rule' and 'Model' in our experiments. Rule refers to judging the quality of a response based on the correctness of its answer, and Model denotes that we train a reward model to score each response. The training data of the reward model is based on the rule judgment. Equations 10 and 21 highlight a key difference between GRPO and Online RFT: GRPO uniquely adjusts its gradient coefficient based on the reward value provided by the reward model. This allows for differential reinforcement and penalization of responses according to their varying magnitudes. In contrast, Online RFT lacks this feature; it does not penalize incorrect responses and uniformly reinforces all responses with correct answers at the same level of intensity.

As demonstrated in Figure 5, GRPO surpasses Online RFT, thereby highlighting the efficiency of altering positive and negative gradient coefficients. In addition, GRPO+PS shows superior performance compared to GRPO+OS, indicating the benefits of using fine-grained, step-aware gradient coefficients. Furthermore, we explore iterative RL; in our experiments, we conduct two rounds of iteration. As shown in Figure 6, iterative RL significantly improves the performance, especially in the first iteration.

![](_page_20_Figure_0.jpeg)

Figure 7 | The Maj@K and Pass@K of SFT and RL DeepSeekMath 7B on GSM8K and MATH (temperature 0.7). It was noted that RL enhances Maj@K but not Pass@K.

### 5.2.2. Why Does RL Work?
In this paper, we conduct reinforcement learning based on a subset of instruction tuning data, and it achieves significant performance enhancement upon the instruction-tuned model. To further explain why reinforcement learning works, we evaluate the Pass@K and Maj@K accuracy of the Instruct and RL models on two benchmarks. As shown in Figure 7, RL enhances Maj@K performance but not Pass@K. These findings indicate that RL enhances the model's overall performance by rendering the output distribution more robust; in other words, it seems that the improvement is attributed to boosting the correct response from the Top-K candidates rather than to an enhancement of fundamental capabilities. Similarly, Wang et al. (2023a) identified a misalignment problem in reasoning tasks within the SFT model, showing that the reasoning performance of SFT models can be improved through a series of preference alignment strategies (Song et al., 2023; Wang et al., 2023a; Yuan et al., 2023b).

### 5.2.3. How to Achieve More Effective RL?

We demonstrate that RL works well in mathematical reasoning tasks. We also provide a unified paradigm to understand different representative training methods. Within this paradigm, all methods are conceptualized as either direct or simplified RL techniques. As summarized in Equation 5, there exist three key components: Data Source, Algorithm, and Reward Function. We provide some potential future directions for each of the three components.

**Data Source** The data source is the raw material of all training methods. In the context of RL, we specifically refer to the data source as the unlabeled questions together with the outputs sampled from the policy model. In this paper, we only use the questions from the instruction tuning stage and naive nucleus sampling to sample outputs. We think this is a potential reason that our RL pipeline only improves the Maj@K performance.
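For reference, the Maj@K and Pass@K metrics discussed above can be sketched in a few lines of Python (our illustration, not the paper's evaluation code; `is_correct` stands in for a hypothetical answer checker):

```python
from collections import Counter

def maj_at_k(samples: list[str], is_correct) -> bool:
    """Maj@K: the most frequent of the K sampled answers must be correct."""
    majority_answer, _ = Counter(samples).most_common(1)[0]
    return is_correct(majority_answer)

def pass_at_k(samples: list[str], is_correct) -> bool:
    """Pass@K: at least one of the K sampled answers is correct."""
    return any(is_correct(a) for a in samples)

# Toy example with K = 5 samples and ground-truth answer "42":
samples = ["42", "42", "41", "42", "7"]
print(maj_at_k(samples, lambda a: a == "42"))   # True: "42" is the majority answer
print(pass_at_k(samples, lambda a: a == "42"))  # True: at least one sample is correct
```

An RL-sharpened output distribution raises the chance that the majority answer is correct (Maj@K) without necessarily adding new correct answers to the sample set (Pass@K).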
In the future, we will explore our RL pipeline on out-of-distribution question prompts, in conjunction with **advanced sampling (decoding) strategies**, like those based on tree-search methods (Yao et al., 2023). In addition, **efficient inference techniques** (Kwon et al., 2023; Leviathan et al., 2023; Xia et al., 2023, 2024), which determine the exploration efficiency of policy models, play an exceedingly important role.

**Algorithms** Algorithms process the data and the reward signal into the gradient coefficient to update the model parameters. Based on Equation 5, to some extent, all current methods fully **TRUST** the signal of the reward function to increase or decrease the conditional probability of a certain token. However, it is impossible to ensure the reward signal is always reliable, especially in extremely complex tasks. For example, even the PRM800K dataset (Lightman et al., 2023), which has been carefully annotated by well-trained annotators, still contains approximately 20% incorrect annotations⁷. To this end, we will explore reinforcement learning algorithms that are robust against noisy reward signals. We believe such **WEAK-TO-STRONG** (Burns et al., 2023) alignment methods will bring a fundamental change to the learning algorithms.

**Reward Function** The reward function is the source of the training signal. In RL, the reward function is usually a neural reward model. We think there exist three important directions for reward models: 1) How to enhance the generalization ability of the reward model. The reward model must generalize effectively to handle out-of-distribution questions and advanced decoding outputs; otherwise, reinforcement learning may merely stabilize the distribution of LLMs rather than improve their fundamental capabilities; 2) How to reflect the uncertainty of the reward model.
The uncertainty could potentially act as a linking bridge between the weak reward model and the weak-to-strong learning algorithms; 3) How to efficiently build high-quality process reward models that can provide fine-grained training signals for the reasoning process (Lightman et al., 2023; Wang et al., 2023b).

# 6. Conclusion, Limitation, and Future Work

We present DeepSeekMath, which outperforms all open-source models on the competition-level MATH benchmark and approaches the performance of closed models. DeepSeekMath is initialized from DeepSeek-Coder-v1.5 7B and undergoes continual training for 500B tokens, with a significant component of the training data being 120B math tokens sourced from Common Crawl. Our extensive ablation study shows that web pages offer significant potential for high-quality mathematical data, while arXiv papers may not be as beneficial as we expected. We introduce Group Relative Policy Optimization (GRPO), a variant of Proximal Policy Optimization (PPO), which can notably improve mathematical reasoning capabilities with less memory consumption. The experimental results show that GRPO is effective even though DeepSeekMath-Instruct 7B has already reached a high score on benchmarks. We also provide a unified paradigm to understand a series of methods and summarize several potential directions for more effective reinforcement learning.

Although DeepSeekMath achieves impressive scores on quantitative reasoning benchmarks, its capabilities in geometry and theorem proving are relatively weaker than those of closed models. For instance, in our dry run, the model cannot handle problems related to triangles and ellipses, which may indicate a data selection bias in pre-training and fine-tuning. In addition, restricted by the model scale, DeepSeekMath is weaker than GPT-4 in few-shot capability. GPT-4 can improve its performance with few-shot inputs, while DeepSeekMath shows similar performance in zero-shot and few-shot evaluation.
In the future, we will further improve our engineered data selection pipeline to construct higher-quality pre-training corpora. In addition, we will explore the potential directions (Section 5.2.3) for more effective reinforcement learning of LLMs.

⁷https://github.com/openai/prm800k/issues/12#issuecomment-1728491852

## References

- R. Anil, S. Borgeaud, Y. Wu, J. Alayrac, J. Yu, R. Soricut, J. Schalkwyk, A. M. Dai, A. Hauth, K. Millican, D. Silver, S. Petrov, M. Johnson, I. Antonoglou, J. Schrittwieser, A. Glaese, J. Chen, E. Pitler, T. P. Lillicrap, A. Lazaridou, O. Firat, J. Molloy, M. Isard, P. R. Barham, T. Hennigan, B. Lee, F. Viola, M. Reynolds, Y. Xu, R. Doherty, E. Collins, C. Meyer, E. Rutherford, E. Moreira, K. Ayoub, M. Goel, G. Tucker, E. Piqueras, M. Krikun, I. Barr, N. Savinov, I. Danihelka, B. Roelofs, A. White, A. Andreassen, T. von Glehn, L. Yagati, M. Kazemi, L. Gonzalez, M. Khalman, J. Sygnowski, and et al. Gemini: A family of highly capable multimodal models. CoRR, abs/2312.11805, 2023. doi: 10.48550/ARXIV.2312.11805. URL https://doi.org/10.48550/arXiv.2312.11805.
- J. Austin, A. Odena, M. Nye, M. Bosma, H. Michalewski, D. Dohan, E. Jiang, C. Cai, M. Terry, Q. Le, et al. Program synthesis with large language models. arXiv preprint arXiv:2108.07732, 2021.
- Z. Azerbayev, H. Schoelkopf, K. Paster, M. D. Santos, S. McAleer, A. Q. Jiang, J. Deng, S. Biderman, and S. Welleck. Llemma: An open language model for mathematics. arXiv preprint arXiv:2310.10631, 2023.
- J. Bai, S. Bai, Y. Chu, Z. Cui, K. Dang, X. Deng, Y. Fan, W. Ge, Y. Han, F. Huang, et al. Qwen technical report. arXiv preprint arXiv:2309.16609, 2023.
- C. Burns, P. Izmailov, J. H. Kirchner, B. Baker, L. Gao, L. Aschenbrenner, Y. Chen, A. Ecoffet, M. Joglekar, J. Leike, et al. Weak-to-strong generalization: Eliciting strong capabilities with weak supervision. arXiv preprint arXiv:2312.09390, 2023.
- ChatGLM3 Team. ChatGLM3 series: Open bilingual chat LLMs, 2023.
URL https://github.com/THUDM/ChatGLM3. +- M. Chen, J. Tworek, H. Jun, Q. Yuan, H. P. de Oliveira Pinto, J. Kaplan, H. Edwards, Y. Burda, N. Joseph, G. Brockman, A. Ray, R. Puri, G. Krueger, M. Petrov, H. Khlaaf, G. Sastry, P. Mishkin, B. Chan, S. Gray, N. Ryder, M. Pavlov, A. Power, L. Kaiser, M. Bavarian, C. Winter, P. Tillet, F. P. Such, D. Cummings, M. Plappert, F. Chantzis, E. Barnes, A. Herbert-Voss, W. H. Guss, A. Nichol, A. Paino, N. Tezak, J. Tang, I. Babuschkin, S. Balaji, S. Jain, W. Saunders, C. Hesse, A. N. Carr, J. Leike, J. Achiam, V. Misra, E. Morikawa, A. Radford, M. Knight, M. Brundage, M. Murati, K. Mayer, P. Welinder, B. McGrew, D. Amodei, S. McCandlish, I. Sutskever, and W. Zaremba. Evaluating large language models trained on code. CoRR, abs/2107.03374, 2021. URL https://arxiv.org/abs/2107.03374. +- W. Chen, X. Ma, X. Wang, and W. W. Cohen. Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks. CoRR, abs/2211.12588, 2022. doi: 10.48550/ARXIV.2211.12588. URL https://doi.org/10.48550/arXiv.2211.12588. +- K. Cobbe, V. Kosaraju, M. Bavarian, M. Chen, H. Jun, L. Kaiser, M. Plappert, J. Tworek, J. Hilton, R. Nakano, et al. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168, 2021. +- T. Computer. Redpajama: an open dataset for training large language models, Oct. 2023. URL https://github.com/togethercomputer/RedPajama-Data. +- DeepSeek-AI. Deepseek LLM: scaling open-source language models with longtermism. CoRR, abs/2401.02954, 2024. doi: 10.48550/ARXIV.2401.02954. URL https://doi.org/10.48550/arXiv.2401.02954. + +- Z. Du, Y. Qian, X. Liu, M. Ding, J. Qiu, Z. Yang, and J. Tang. Glm: General language model pretraining with autoregressive blank infilling. In Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 320–335, 2022. +- L. Gao, A. Madaan, S. Zhou, U. Alon, P. Liu, Y. Yang, J. Callan, and G. Neubig. 
PAL: program-aided language models. In A. Krause, E. Brunskill, K. Cho, B. Engelhardt, S. Sabato, and J. Scarlett, editors, International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA, volume 202 of Proceedings of Machine Learning Research, pages 10764–10799. PMLR, 2023. URL https://proceedings.mlr.press/v202/gao23f.html.
- Z. Gou, Z. Shao, Y. Gong, Y. Shen, Y. Yang, M. Huang, N. Duan, and W. Chen. ToRA: A tool-integrated reasoning agent for mathematical problem solving. CoRR, abs/2309.17452, 2023. doi: 10.48550/ARXIV.2309.17452. URL https://doi.org/10.48550/arXiv.2309.17452.
- D. Guo, Q. Zhu, D. Yang, Z. Xie, K. Dong, W. Zhang, G. Chen, X. Bi, Y. Wu, Y. K. Li, F. Luo, Y. Xiong, and W. Liang. Deepseek-coder: When the large language model meets programming – the rise of code intelligence, 2024.
- D. Hendrycks, C. Burns, S. Basart, A. Zou, M. Mazeika, D. Song, and J. Steinhardt. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300, 2020.
- D. Hendrycks, C. Burns, S. Kadavath, A. Arora, S. Basart, E. Tang, D. Song, and J. Steinhardt. Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874, 2021.
- High-flyer. HAI-LLM: An efficient and lightweight tool for training large models, 2023. URL https://www.high-flyer.cn/en/blog/hai-llm.
- Inflection AI. Inflection-2, 2023. URL https://inflection.ai/inflection-2.
- A. Q. Jiang, S. Welleck, J. P. Zhou, W. Li, J. Liu, M. Jamnik, T. Lacroix, Y. Wu, and G. Lample. Draft, sketch, and prove: Guiding formal theorem provers with informal proofs. arXiv preprint arXiv:2210.12283, 2022.
- A. Q. Jiang, A. Sablayrolles, A. Mensch, C. Bamford, D. S. Chaplot, D. d. l. Casas, F. Bressand, G. Lengyel, G. Lample, L. Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825, 2023.
- A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, and T. Mikolov. FastText.zip: Compressing text classification models. arXiv preprint arXiv:1612.03651, 2016.
- W. Kwon, Z. Li, S. Zhuang, Y. Sheng, L. Zheng, C. H. Yu, J.
E. Gonzalez, H. Zhang, and I. Stoica. Efficient memory management for large language model serving with PagedAttention. In Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, 2023.
- Y. Leviathan, M. Kalman, and Y. Matias. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning, pages 19274–19286. PMLR, 2023.
- A. Lewkowycz, A. Andreassen, D. Dohan, E. Dyer, H. Michalewski, V. Ramasesh, A. Slone, C. Anil, I. Schlag, T. Gutman-Solo, et al. Solving quantitative reasoning problems with language models. Advances in Neural Information Processing Systems, 35:3843–3857, 2022a.
- A. Lewkowycz, A. Andreassen, D. Dohan, E. Dyer, H. Michalewski, V. V. Ramasesh, A. Slone, C. Anil, I. Schlag, T. Gutman-Solo, Y. Wu, B. Neyshabur, G. Gur-Ari, and V. Misra. Solving quantitative reasoning problems with language models. In S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh, editors, Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 – December 9, 2022, 2022b. URL http://papers.nips.cc/paper\_files/paper/2022/hash/18abbeef8cfe9203fdf9053c9c4fe191-Abstract-Conference.html.
- H. Lightman, V. Kosaraju, Y. Burda, H. Edwards, B. Baker, T. Lee, J. Leike, J. Schulman, I. Sutskever, and K. Cobbe. Let's verify step by step. arXiv preprint arXiv:2305.20050, 2023.
- I. Loshchilov and F. Hutter. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101, 2017.
- H. Luo, Q. Sun, C. Xu, P. Zhao, J. Lou, C. Tao, X. Geng, Q. Lin, S. Chen, and D. Zhang. Wizardmath: Empowering mathematical reasoning for large language models via reinforced evol-instruct. arXiv preprint arXiv:2308.09583, 2023.
- S. Mishra, M. Finlayson, P. Lu, L. Tang, S. Welleck, C. Baral, T. Rajpurohit, O. Tafjord, A. Sabharwal, P. Clark, and A. Kalyan. LILA: A unified benchmark for mathematical reasoning.
In Y. Goldberg, Z. Kozareva, and Y. Zhang, editors, Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, EMNLP 2022, Abu Dhabi, United Arab Emirates, December 7-11, 2022, pages 5807–5832. Association for Computational Linguistics, 2022. doi: 10.18653/V1/2022.EMNLP-MAIN.392. URL https://doi.org/10.18653/v1/2022.emnlp-main.392.
- X. Nguyen, W. Zhang, X. Li, M. M. Aljunied, Q. Tan, L. Cheng, G. Chen, Y. Deng, S. Yang, C. Liu, H. Zhang, and L. Bing. SeaLLMs: Large language models for Southeast Asia. CoRR, 2023.
- OpenAI. GPT-4 technical report. arXiv preprint arXiv:2303.08774, 2023.
- L. Ouyang, J. Wu, X. Jiang, D. Almeida, C. Wainwright, P. Mishkin, C. Zhang, S. Agarwal, K. Slama, A. Ray, et al. Training language models to follow instructions with human feedback. Advances in Neural Information Processing Systems, 35:27730–27744, 2022.
- K. Paster, M. D. Santos, Z. Azerbayev, and J. Ba. Openwebmath: An open dataset of high-quality mathematical web text. CoRR, abs/2310.06786, 2023. doi: 10.48550/ARXIV.2310.06786. URL https://doi.org/10.48550/arXiv.2310.06786.
- L. C. Paulson. Three years of experience with sledgehammer, a practical link between automatic and interactive theorem provers. In R. A. Schmidt, S. Schulz, and B. Konev, editors, Proceedings of the 2nd Workshop on Practical Aspects of Automated Reasoning, PAAR-2010, Edinburgh, Scotland, UK, July 14, 2010, volume 9 of EPiC Series in Computing, pages 1–10. EasyChair, 2010. doi: 10.29007/TNFD. URL https://doi.org/10.29007/tnfd.
- S. Polu and I. Sutskever. Generative language modeling for automated theorem proving. CoRR, abs/2009.03393, 2020. URL https://arxiv.org/abs/2009.03393.
- R. Rafailov, A. Sharma, E. Mitchell, S. Ermon, C. D. Manning, and C. Finn. Direct preference optimization: Your language model is secretly a reward model. arXiv preprint arXiv:2305.18290, 2023.
- J. Schulman. Approximating KL divergence, 2020. URL http://joschu.net/blog/kl-approx.html.
- J. Schulman, P. Moritz, S. Levine, M. Jordan, and P. Abbeel. High-dimensional continuous control using generalized advantage estimation. arXiv preprint arXiv:1506.02438, 2015.
- J. Schulman, F. Wolski, P. Dhariwal, A. Radford, and O. Klimov. Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347, 2017.
+- F. Shi, M. Suzgun, M. Freitag, X. Wang, S. Srivats, S. Vosoughi, H. W. Chung, Y. Tay, S. Ruder, D. Zhou, D. Das, and J. Wei. Language models are multilingual chain-of-thought reasoners. In The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1-5, 2023. OpenReview.net, 2023. URL https://openreview.net/pdf?id=fR3wGCk-IXp. +- F. Song, B. Yu, M. Li, H. Yu, F. Huang, Y. Li, and H. Wang. Preference ranking optimization for human alignment. arXiv preprint arXiv:2306.17492, 2023. +- M. Suzgun, N. Scales, N. Schärli, S. Gehrmann, Y. Tay, H. W. Chung, A. Chowdhery, Q. V. Le, E. H. Chi, D. Zhou, et al. Challenging big-bench tasks and whether chain-of-thought can solve them. arXiv preprint arXiv:2210.09261, 2022. +- T. Tao. Embracing change and resetting expectations, 2023. URL https://unlocked.microsoft.com/ai-anthology/terence-tao/. +- H. Touvron, L. Martin, K. Stone, P. Albert, A. Almahairi, Y. Babaei, N. Bashlykov, S. Batra, P. Bhargava, S. Bhosale, D. Bikel, L. Blecher, C. Canton-Ferrer, M. Chen, G. Cucurull, D. Esiobu, J. Fernandes, J. Fu, W. Fu, B. Fuller, C. Gao, V. Goswami, N. Goyal, A. Hartshorn, S. Hosseini, R. Hou, H. Inan, M. Kardas, V. Kerkez, M. Khabsa, I. Kloumann, A. Korenev, P. S. Koura, M. Lachaux, T. Lavril, J. Lee, D. Liskovich, Y. Lu, Y. Mao, X. Martinet, T. Mihaylov, P. Mishra, I. Molybog, Y. Nie, A. Poulton, J. Reizenstein, R. Rungta, K. Saladi, A. Schelten, R. Silva, E. M. Smith, R. Subramanian, X. E. Tan, B. Tang, R. Taylor, A. Williams, J. X. Kuan, P. Xu, Z. Yan, I. Zarov, Y. Zhang, A. Fan, M. Kambadur, S. Narang, A. Rodriguez, R. Stojnic, S. Edunov, and T. Scialom. Llama 2: Open foundation and fine-tuned chat models. CoRR, abs/2307.09288, 2023. doi: 10.48550/arXiv.2307.09288. URL https://doi.org/10.48550/arXiv.2307.09288. +- T. H. Trinh, Y. Wu, Q. V. Le, H. He, and T. Luong. Solving olympiad geometry without human demonstrations. Nature, 625(7995):476–482, 2024. +- P. Wang, L. Li, L. Chen, F. Song, B. 
Lin, Y. Cao, T. Liu, and Z. Sui. Making large language models better reasoners with alignment. arXiv preprint arXiv:2309.02144, 2023a. +- P. Wang, L. Li, Z. Shao, R. Xu, D. Dai, Y. Li, D. Chen, Y. Wu, and Z. Sui. Math-shepherd: Verify and reinforce LLMs step-by-step without human annotations. CoRR, abs/2312.08935, 2023b. +- Z. Wang, R. Xia, and P. Liu. Generative AI for math: Part I mathpile: A billion-token-scale pretraining corpus for math. CoRR, abs/2312.17120, 2023c. doi: 10.48550/ARXIV.2312.17120. URL https://doi.org/10.48550/arXiv.2312.17120. +- J. Wei, X. Wang, D. Schuurmans, M. Bosma, B. Ichter, F. Xia, E. H. Chi, Q. V. Le, and D. Zhou. Chain-of-thought prompting elicits reasoning in large language models. In NeurIPS, 2022. URL http://papers.nips.cc/paper_files/paper/2022/hash/9d5609613524ecf4f15af0f7b31abca4-Abstract-Conference.html. +- T. Wei, J. Luan, W. Liu, S. Dong, and B. Wang. Cmath: Can your language model pass Chinese elementary school math test?, 2023. +- M. Wenzel, L. C. Paulson, and T. Nipkow. The Isabelle framework. In O. A. Mohamed, C. A. Muñoz, and S. Tahar, editors, Theorem Proving in Higher Order Logics, 21st International Conference, TPHOLs 2008, Montreal, Canada, August 18-21, 2008. Proceedings, volume 5170 of Lecture Notes in Computer Science, pages 33–38. Springer, 2008. doi: 10.1007/978-3-540-71067-7_7. URL https://doi.org/10.1007/978-3-540-71067-7_7. +- H. Xia, T. Ge, P. Wang, S.-Q. Chen, F. Wei, and Z. Sui. Speculative decoding: Exploiting speculative execution for accelerating seq2seq generation. In H. Bouamor, J. Pino, and K. Bali, editors, Findings of the Association for Computational Linguistics: EMNLP 2023, pages 3909–3925, Singapore, Dec. 2023. Association for Computational Linguistics. doi: 10.18653/v1/2023.findings-emnlp.257. URL https://aclanthology.org/2023.findings-emnlp.257. +- H. Xia, Z. Yang, Q. Dong, P. Wang, Y. Li, T. Ge, T. Liu, W. Li, and Z. Sui. 
Unlocking efficiency in large language model inference: A comprehensive survey of speculative decoding. arXiv preprint arXiv:2401.07851, 2024. +- S. Yao, D. Yu, J. Zhao, I. Shafran, T. L. Griffiths, Y. Cao, and K. Narasimhan. Tree of thoughts: Deliberate problem solving with large language models. arXiv:2305.10601, 2023. +- L. Yu, W. Jiang, H. Shi, J. Yu, Z. Liu, Y. Zhang, J. T. Kwok, Z. Li, A. Weller, and W. Liu. Metamath: Bootstrap your own mathematical questions for large language models. CoRR, abs/2309.12284, 2023. doi: 10.48550/ARXIV.2309.12284. URL https://doi.org/10.48550/arXiv.2309.12284. +- Z. Yuan, H. Yuan, C. Li, G. Dong, C. Tan, and C. Zhou. Scaling relationship on learning mathematical reasoning with large language models. arXiv preprint arXiv:2308.01825, 2023a. +- Z. Yuan, H. Yuan, C. Tan, W. Wang, S. Huang, and F. Huang. Rrhf: Rank responses to align language models with human feedback without tears. arXiv preprint arXiv:2304.05302, 2023b. +- X. Yue, X. Qu, G. Zhang, Y. Fu, W. Huang, H. Sun, Y. Su, and W. Chen. Mammoth: Building math generalist models through hybrid instruction tuning. CoRR, abs/2309.05653, 2023. doi: 10.48550/ARXIV.2309.05653. URL https://doi.org/10.48550/arXiv.2309.05653. +- K. Zheng, J. M. Han, and S. Polu. Minif2f: a cross-system benchmark for formal olympiad-level mathematics. arXiv preprint arXiv:2109.00110, 2021. +- W. Zhong, R. Cui, Y. Guo, Y. Liang, S. Lu, Y. Wang, A. Saied, W. Chen, and N. Duan. AGIEval: A human-centric benchmark for evaluating foundation models. CoRR, abs/2304.06364, 2023. doi: 10.48550/arXiv.2304.06364. URL https://doi.org/10.48550/arXiv.2304.06364. + +# A. Appendix + +### A.1. Analysis of Reinforcement Learning + +We provide the detailed derivation of the data source and gradient coefficient (algorithm and reward function) across various methods, including SFT, RFT, Online RFT, DPO, PPO, and GRPO. + +#### A.1.1. 
Supervised Fine-tuning + +The objective of Supervised Fine-tuning is maximizing the following objective: + +$$\mathcal{J}_{SFT}(\theta) = \mathbb{E}[q, o \sim P_{sft}(Q, O)] \left( \frac{1}{|o|} \sum_{t=1}^{|o|} \log \pi_{\theta}(o_t | q, o_{< t}) \right). \tag{6}$$ + +The gradient of $\mathcal{J}_{SFT}(\theta)$ is: + +$$\nabla_{\theta} \mathcal{J}_{SFT} = \mathbb{E}\left[ q, o \sim P_{\text{sft}}(Q, O) \right] \left( \frac{1}{|o|} \sum_{t=1}^{|o|} \nabla_{\theta} \log \pi_{\theta}(o_t \mid q, o_{< t}) \right). \tag{7}$$ + +Data Source: The dataset employed for SFT. Reward Function: This can be regarded as human selection. Gradient Coefficient: always set to 1. + +#### A.1.2. Rejection Sampling Fine-tuning + +Rejection Sampling Fine-tuning first samples multiple outputs from the supervised fine-tuned LLMs for each question, and then trains LLMs on the sampled outputs with the correct answer. Formally, the objective of RFT is to maximize the following objective: + +$$\mathcal{J}_{\text{RFT}}(\theta) = \mathbb{E}_{q \sim P_{\text{sft}}(Q),\, o \sim \pi_{\text{sft}}(O|q)} \left( \frac{1}{|o|} \sum_{t=1}^{|o|} \mathbb{I}(o) \log \pi_{\theta}(o_t | q, o_{< t}) \right). \tag{8}$$ + +The gradient of $\mathcal{J}_{RFT}(\theta)$ is: + +$$\nabla_{\theta} \mathcal{J}_{RFT}(\theta) = \mathbb{E}\left[q \sim P_{sft}(Q), o \sim \pi_{sft}(O|q)\right] \left(\frac{1}{|o|} \sum_{t=1}^{|o|} \mathbb{I}(o) \nabla_{\theta} \log \pi_{\theta}(o_t|q, o_{< t})\right). \tag{9}$$ + +Data Source: question in SFT dataset with outputs sampled from SFT model. Reward Function: Rule (whether the answer is correct or not). Gradient Coefficient: + +$$GC_{RFT}(q, o, t) = \mathbb{I}(o) = \begin{cases} 1 & \text{the answer of } o \text{ is correct} \\ 0 & \text{the answer of } o \text{ is incorrect} \end{cases} \tag{10}$$ + +#### A.1.3. 
Online Rejection Sampling Fine-tuning + +The only difference between RFT and Online RFT is that the outputs of Online RFT are sampled from the real-time policy model $\pi_{\theta}$, rather than from the SFT model $\pi_{\theta_{sft}}$. Therefore, the gradient of Online RFT is: + +$$\nabla_{\theta} \mathcal{J}_{OnRFT}(\theta) = \mathbb{E}[q \sim P_{sft}(Q), o \sim \pi_{\theta}(O|q)] \left(\frac{1}{|o|} \sum_{t=1}^{|o|} \mathbb{I}(o) \nabla_{\theta} \log \pi_{\theta}(o_{t}|q, o_{< t})\right). \tag{11}$$ + +#### A.1.4. Direct Preference Optimization (DPO) + +The objective of DPO is: + +$$\mathcal{J}_{DPO}(\theta) = \mathbb{E}[q \sim P_{sft}(Q), o^{+}, o^{-} \sim \pi_{sft}(O|q)] \log \sigma \left(\beta \frac{1}{|o^{+}|} \sum_{t=1}^{|o^{+}|} \log \frac{\pi_{\theta}(o_{t}^{+}|q, o_{< t}^{+})}{\pi_{\text{ref}}(o_{t}^{+}|q, o_{< t}^{+})} - \beta \frac{1}{|o^{-}|} \sum_{t=1}^{|o^{-}|} \log \frac{\pi_{\theta}(o_{t}^{-}|q, o_{< t}^{-})}{\pi_{\text{ref}}(o_{t}^{-}|q, o_{< t}^{-})}\right). \tag{12}$$ + +The gradient of $\mathcal{J}_{DPO}(\theta)$ is: + +$$\nabla_{\theta} \mathcal{J}_{DPO}(\theta) = \mathbb{E}[q \sim P_{sft}(Q), o^{+}, o^{-} \sim \pi_{sft}(O|q)] \left( \frac{1}{|o^{+}|} \sum_{t=1}^{|o^{+}|} GC_{DPO}(q, o, t) \nabla_{\theta} \log \pi_{\theta}(o_{t}^{+}|q, o_{< t}^{+}) - \frac{1}{|o^{-}|} \sum_{t=1}^{|o^{-}|} GC_{DPO}(q, o, t) \nabla_{\theta} \log \pi_{\theta}(o_{t}^{-}|q, o_{< t}^{-}) \right). \tag{13}$$ + +Data Source: question in SFT dataset with outputs sampled from SFT model. Reward Function: human preference in the general domain (can be 'Rule' in mathematical tasks). 
Gradient Coefficient: + +$$GC_{DPO}(q, o, t) = \sigma \left( \beta \log \frac{\pi_{\theta}(o_t^- | q, o_{< t}^-)}{\pi_{ref}(o_t^- | q, o_{< t}^-)} - \beta \log \frac{\pi_{\theta}(o_t^+ | q, o_{< t}^+)}{\pi_{ref}(o_t^+ | q, o_{< t}^+)} \right)$$ +(14) + +#### A.1.5. Proximal Policy Optimization (PPO) + +The objective of PPO is: + +$$\mathcal{J}_{PPO}(\theta) = \mathbb{E}\left[q \sim P_{sft}(Q), o \sim \pi_{\theta_{old}}(O|q)\right] \frac{1}{|o|} \sum_{t=1}^{|o|} \min\left[\frac{\pi_{\theta}(o_t|q, o_{< t})}{\pi_{\theta_{old}}(o_t|q, o_{< t})} A_t, \operatorname{clip}\left(\frac{\pi_{\theta}(o_t|q, o_{< t})}{\pi_{\theta_{old}}(o_t|q, o_{< t})}, 1 - \varepsilon, 1 + \varepsilon\right) A_t\right]. \tag{15}$$ + +To simplify the analysis, it is assumed that the model only has a single update following each exploration stage, thereby ensuring that $\pi_{\theta_{old}} = \pi_{\theta}$ . In this case, we can remove the min and clip operation: + +$$\mathcal{J}_{PPO}(\theta) = \mathbb{E}[q \sim P_{sft}(Q), o \sim \pi_{\theta_{old}}(O|q)] \frac{1}{|o|} \sum_{t=1}^{|o|} \frac{\pi_{\theta}(o_t|q, o_{< t})}{\pi_{\theta_{old}}(o_t|q, o_{< t})} A_t.$$ + (16) + +The gradient of $\mathcal{J}_{PPO}(\theta)$ is: + +$$\nabla_{\theta} \mathcal{J}_{PPO}(\theta) = \mathbb{E}[q \sim P_{sft}(Q),\ o \sim \pi_{\theta_{\text{old}}}(O|q)]\ \frac{1}{|o|} \sum_{t=1}^{|o|} A_t \nabla_{\theta} \log \pi_{\theta}(o_t|q, o_{< t})$$ +(17) + +Data Source: question in SFT dataset with outputs sampled from policy model. Reward Function: reward model. Gradient Coefficient: + +$$GC_{\text{PPO}}(q, o, t, \pi_{\theta_{\text{rm}}}) = A_t, \tag{18}$$ + +where $A_t$ is the advantage, which is computed by applying Generalized Advantage Estimation (GAE) (Schulman et al., 2015), based on the rewards $\{r_{\geq t}\}$ and a learned value function $V_{\psi}$ . + +#### A.1.6. 
Group Relative Policy Optimization (GRPO) + +The objective of GRPO is (assume $\pi_{\theta_{old}} = \pi_{\theta}$ for simplified analysis): + +$$\mathcal{J}_{GRPO}(\theta) = \mathbb{E}[q \sim P_{sft}(Q), \{o_i\}_{i=1}^G \sim \pi_{\theta_{old}}(O|q)] \quad +\frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \left[ \frac{\pi_{\theta}(o_{i,t}|q, o_{i,< t})}{\pi_{\theta_{old}}(o_{i,t}|q, o_{i,< t})} \hat{A}_{i,t} - \beta \left( \frac{\pi_{\text{ref}}(o_{i,t}|q, o_{i,< t})}{\pi_{\theta}(o_{i,t}|q, o_{i,< t})} - \log \frac{\pi_{\text{ref}}(o_{i,t}|q, o_{i,< t})}{\pi_{\theta}(o_{i,t}|q, o_{i,< t})} - 1 \right) \right]. \tag{19}$$ + +The gradient of $\mathcal{J}_{GRPO}(\theta)$ is: + +$$\nabla_{\theta} \mathcal{J}_{GRPO}(\theta) = \mathbb{E}\left[q \sim P_{sft}(Q), \{o_{i}\}_{i=1}^{G} \sim \pi_{\theta_{old}}(O|q)\right] \\ \frac{1}{G} \sum_{i=1}^{G} \frac{1}{|o_{i}|} \sum_{t=1}^{|o_{i}|} \left[\hat{A}_{i,t} + \beta \left(\frac{\pi_{\text{ref}}(o_{i,t}|q, o_{i,< t})}{\pi_{\theta}(o_{i,t}|q, o_{i,< t})} - 1\right)\right] \nabla_{\theta} \log \pi_{\theta}(o_{i,t}|q, o_{i,< t}). \tag{20}$$ + +Data Source: question in SFT dataset with outputs sampled from policy model. Reward Function: reward model. Gradient Coefficient: + +$$GC_{GRPO}(q, o, t, \pi_{\theta_{\text{rm}}}) = \hat{A}_{i,t} + \beta \left( \frac{\pi_{\text{ref}}(o_{i,t}|q, o_{i,< t})}{\pi_{\theta}(o_{i,t}|q, o_{i,< t})} - 1 \right), \tag{21}$$ + +where $\hat{A}_{i,t}$ is computed based on the group reward scores. 
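The GRPO gradient coefficient above can be made concrete with a short sketch. The snippet below is an illustration, not code from the paper: it computes outcome-supervision group-relative advantages (each output's reward normalized by its group's mean and standard deviation, shared across that output's tokens) and the per-token coefficient $\hat{A}_{i,t} + \beta(\pi_{\text{ref}}/\pi_{\theta} - 1)$ from per-token log-probabilities. The function names and the `beta` default are our own choices.

```python
import math

def group_relative_advantages(rewards):
    """Outcome supervision: A_hat_i = (r_i - mean(r)) / std(r), computed
    within one group of G sampled outputs; every token of output o_i
    shares the same advantage."""
    n = len(rewards)
    mean = sum(rewards) / n
    std = math.sqrt(sum((r - mean) ** 2 for r in rewards) / n)
    if std == 0.0:
        # All rewards in the group are equal: no relative signal.
        return [0.0 for _ in rewards]
    return [(r - mean) / std for r in rewards]

def grpo_gradient_coefficient(advantage, logp_theta, logp_ref, beta=0.04):
    """Per-token coefficient A_hat + beta * (pi_ref / pi_theta - 1),
    with the probability ratio recovered from per-token log-probs."""
    ratio = math.exp(logp_ref - logp_theta)
    return advantage + beta * (ratio - 1.0)
```

With rule-based 0/1 rewards such as `[1, 0, 0, 1]`, the advantages come out as `[1, -1, -1, 1]`: correct outputs are pushed up and incorrect ones down without a learned value function, while the `beta` term nudges $\pi_{\theta}$ back toward $\pi_{\text{ref}}$ wherever the reference model assigns a token higher probability than the policy does.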
\ No newline at end of file diff --git a/research/papers/arXiv-vanilla_grpo/arXiv-2402.03300_meta.json b/research/papers/arXiv-vanilla_grpo/arXiv-2402.03300_meta.json new file mode 100644 index 0000000..e90836c --- /dev/null +++ b/research/papers/arXiv-vanilla_grpo/arXiv-2402.03300_meta.json @@ -0,0 +1,2319 @@ +{ + "table_of_contents": [ + { + "title": "DeepSeekMath: Pushing the Limits of Mathematical\nReasoning in Open Language Models", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 103.46106817980558, + 115.92430865764618 + ], + [ + 489.56577914067424, + 115.92430865764618 + ], + [ + 489.56577914067424, + 152.09927022457123 + ], + [ + 103.46106817980558, + 152.09927022457123 + ] + ] + }, + { + "title": "Abstract", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 267.64928507384485, + 293.0994045138359 + ], + [ + 326.87699801735675, + 293.0994045138359 + ], + [ + 326.87699801735675, + 305.1195333587837 + ], + [ + 267.64928507384485, + 305.1195333587837 + ] + ] + }, + { + "title": "1. Introduction", + "heading_level": null, + "page_id": 1, + "polygon": [ + [ + 68.97404545320371, + 85.50445461273193 + ], + [ + 162.68878112331748, + 85.50445461273193 + ], + [ + 162.68878112331748, + 96.70864816531473 + ], + [ + 68.97404545320371, + 96.70864816531473 + ] + ] + }, + { + "title": "1.1. 
Contributions", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 68.97404545320371, + 146.18749141268506 + ], + [ + 164.9379347794002, + 146.18749141268506 + ], + [ + 164.9379347794002, + 155.93332417353074 + ], + [ + 68.97404545320371, + 155.93332417353074 + ] + ] + }, + { + "title": "Math Pre-Training at Scale", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 68.97404545320371, + 203.07307970523834 + ], + [ + 204.67298270352842, + 203.07307970523834 + ], + [ + 204.67298270352842, + 214.58329474925995 + ], + [ + 68.97404545320371, + 214.58329474925995 + ] + ] + }, + { + "title": "Exploration and Analysis of Reinforcement Learning", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 69.72376333856462, + 460.30317808917243 + ], + [ + 335.87361264168766, + 460.30317808917243 + ], + [ + 335.87361264168766, + 471.54836973630205 + ], + [ + 69.72376333856462, + 471.54836973630205 + ] + ] + }, + { + "title": "1.2. Summary of Evaluations and Metrics", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 69.72376333856462, + 703.7674341201782 + ], + [ + 278.8950533542585, + 703.7674341201782 + ], + [ + 278.8950533542585, + 714.4445093143019 + ], + [ + 69.72376333856462, + 714.4445093143019 + ] + ] + }, + { + "title": "2. Math Pre-Training", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 68.97404545320371, + 527.8255755901337 + ], + [ + 196.4260859645584, + 527.8255755901337 + ], + [ + 196.4260859645584, + 540.9801070690155 + ], + [ + 68.97404545320371, + 540.9801070690155 + ] + ] + }, + { + "title": "2.1. Data Collection and Decontamination", + "heading_level": null, + "page_id": 3, + "polygon": [ + [ + 68.97404545320371, + 555.3678758740425 + ], + [ + 282.6436427810631, + 555.3678758740425 + ], + [ + 282.6436427810631, + 566.0079795721908 + ], + [ + 68.97404545320371, + 566.0079795721908 + ] + ] + }, + { + "title": "2.2. 
Validating the Quality of the DeepSeekMath Corpus", + "heading_level": null, + "page_id": 5, + "polygon": [ + [ + 68.97404545320371, + 86.7376919388771 + ], + [ + 355.36627766107136, + 86.7376919388771 + ], + [ + 355.36627766107136, + 98.20800705159868 + ], + [ + 68.97404545320371, + 98.20800705159868 + ] + ] + }, + { + "title": "2.2.1. Training Setting", + "heading_level": null, + "page_id": 5, + "polygon": [ + [ + 68.97404545320371, + 281.8794706213825 + ], + [ + 180.68201037197932, + 281.8794706213825 + ], + [ + 180.68201037197932, + 293.0994045138359 + ], + [ + 68.97404545320371, + 293.0994045138359 + ] + ] + }, + { + "title": "2.2.2. Evaluation Results", + "heading_level": null, + "page_id": 5, + "polygon": [ + [ + 68.97404545320371, + 645.3942006826401 + ], + [ + 195.67636807919752, + 645.3942006826401 + ], + [ + 195.67636807919752, + 655.2600992918015 + ], + [ + 68.97404545320371, + 655.2600992918015 + ] + ] + }, + { + "title": "2.3. Training and Evaluating DeepSeekMath-Base 7B", + "heading_level": null, + "page_id": 6, + "polygon": [ + [ + 68.97404545320371, + 702.4496382240302 + ], + [ + 336.6233305270486, + 702.4496382240302 + ], + [ + 336.6233305270486, + 713.6948298711599 + ], + [ + 68.97404545320371, + 713.6948298711599 + ] + ] + }, + { + "title": "3. Supervised Fine-Tuning", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 68.97404545320371, + 203.4841588139534 + ], + [ + 230.16339080579937, + 203.4841588139534 + ], + [ + 230.16339080579937, + 215.81653207540512 + ], + [ + 68.97404545320371, + 215.81653207540512 + ] + ] + }, + { + "title": "3.1. SFT Data Curation", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 68.97404545320371, + 230.15158904458622 + ], + [ + 186.67975345486659, + 230.15158904458622 + ], + [ + 186.67975345486659, + 239.8974218054319 + ], + [ + 68.97404545320371, + 239.8974218054319 + ] + ] + }, + { + "title": "3.2. 
Training and Evaluating DeepSeekMath-Instruct 7B", + "heading_level": null, + "page_id": 9, + "polygon": [ + [ + 68.97404545320371, + 455.8051014303206 + ], + [ + 353.1171240049886, + 455.8051014303206 + ], + [ + 353.1171240049886, + 467.0502930774502 + ], + [ + 68.97404545320371, + 467.0502930774502 + ] + ] + }, + { + "title": "4. Reinforcement Learning", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 68.97404545320371, + 460.30317808917243 + ], + [ + 231.66282657652118, + 460.30317808917243 + ], + [ + 231.66282657652118, + 472.29804917944404 + ], + [ + 68.97404545320371, + 472.29804917944404 + ] + ] + }, + { + "title": "4.1. Group Relative Policy Optimization", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 69.72376333856462, + 487.53982293605804 + ], + [ + 273.6470281567322, + 486.54195859914154 + ], + [ + 273.6470281567322, + 497.78715024627115 + ], + [ + 69.72376333856462, + 499.2865091325551 + ] + ] + }, + { + "title": "4.1.1. From PPO to GRPO", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 68.97404545320371, + 580.032622396946 + ], + [ + 200.9243932767239, + 580.032622396946 + ], + [ + 200.9243932767239, + 589.997721752734 + ], + [ + 68.97404545320371, + 589.997721752734 + ] + ] + }, + { + "title": "Algorithm 1 Iterative Group Relative Policy Optimization", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 69.46824425458908, + 86.32661283016205 + ], + [ + 350.118252463545, + 86.32661283016205 + ], + [ + 350.118252463545, + 98.20800705159868 + ], + [ + 69.46824425458908, + 98.20800705159868 + ] + ] + }, + { + "title": "4.1.2. Outcome Supervision RL with GRPO", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 68.97404545320371, + 405.57657873980827 + ], + [ + 284.8927964371458, + 405.57657873980827 + ], + [ + 284.8927964371458, + 416.4231371283531 + ], + [ + 68.97404545320371, + 416.4231371283531 + ] + ] + }, + { + "title": "4.1.3. 
Process Supervision RL with GRPO", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 68.97404545320371, + 541.2685579485056 + ], + [ + 277.3956175835367, + 541.2685579485056 + ], + [ + 277.3956175835367, + 551.7640701524933 + ], + [ + 68.97404545320371, + 551.7640701524933 + ] + ] + }, + { + "title": "4.1.4. Iterative RL with GRPO", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 68.97404545320371, + 86.96281540446905 + ], + [ + 222.66621195219028, + 86.96281540446905 + ], + [ + 222.66621195219028, + 96.70864816531473 + ], + [ + 68.97404545320371, + 96.70864816531473 + ] + ] + }, + { + "title": "4.2. Training and Evaluating DeepSeekMath-RL", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 68.97404545320371, + 223.21595603227615 + ], + [ + 313.3820760808604, + 223.21595603227615 + ], + [ + 313.3820760808604, + 233.90401285886765 + ], + [ + 68.97404545320371, + 233.90401285886765 + ] + ] + }, + { + "title": "5. Discussion", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 68.97404545320371, + 558.5111851407711 + ], + [ + 152.94244861362563, + 558.5111851407711 + ], + [ + 152.94244861362563, + 568.2570179016168 + ], + [ + 68.97404545320371, + 568.2570179016168 + ] + ] + }, + { + "title": "5.1. Lessons Learnt in Pre-Training", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 68.97404545320371, + 618.485540592129 + ], + [ + 246.65718428373938, + 618.485540592129 + ], + [ + 246.65718428373938, + 629.7307322392587 + ], + [ + 68.97404545320371, + 629.7307322392587 + ] + ] + }, + { + "title": "5.1.1. 
Code Training Benefits Mathematical Reasoning", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 68.2243275678428, + 711.4457915417339 + ], + [ + 336.6233305270486, + 711.4457915417339 + ], + [ + 336.6233305270486, + 721.9413037457216 + ], + [ + 68.2243275678428, + 721.9413037457216 + ] + ] + }, + { + "title": "Two-Stage Training", + "heading_level": null, + "page_id": 15, + "polygon": [ + [ + 68.8869200348854, + 390.93623238801956 + ], + [ + 169.43624209156565, + 390.93623238801956 + ], + [ + 169.43624209156565, + 401.8281815240984 + ], + [ + 68.8869200348854, + 401.8281815240984 + ] + ] + }, + { + "title": "One-Stage Training", + "heading_level": null, + "page_id": 15, + "polygon": [ + [ + 69.72376333856462, + 513.5304185522526 + ], + [ + 168.68652420620475, + 513.5304185522526 + ], + [ + 168.68652420620475, + 524.7756101993823 + ], + [ + 69.72376333856462, + 524.7756101993823 + ] + ] + }, + { + "title": "5.1.2. ArXiv Papers Seem Ineffective in Improving Mathematical Reasoning", + "heading_level": null, + "page_id": 16, + "polygon": [ + [ + 68.97404545320371, + 710.6961120985919 + ], + [ + 440.8341165922151, + 710.6961120985919 + ], + [ + 440.8341165922151, + 721.9413037457216 + ], + [ + 68.97404545320371, + 721.9413037457216 + ] + ] + }, + { + "title": "5.2. Insights of Reinforcement Learning", + "heading_level": null, + "page_id": 17, + "polygon": [ + [ + 68.97404545320371, + 465.5509341911663 + ], + [ + 272.14759238601033, + 465.5509341911663 + ], + [ + 272.14759238601033, + 476.0464463951539 + ], + [ + 68.97404545320371, + 476.0464463951539 + ] + ] + }, + { + "title": "5.2.1. Towards to a Unified Paradigm", + "heading_level": null, + "page_id": 17, + "polygon": [ + [ + 68.97404545320371, + 488.0413174854255 + ], + [ + 254.1543631373485, + 488.0413174854255 + ], + [ + 254.1543631373485, + 498.53682968941314 + ], + [ + 68.97404545320371, + 498.53682968941314 + ] + ] + }, + { + "title": "5.2.2. 
Why RL Works?", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 68.97404545320371, + 348.6009410610182 + ], + [ + 181.4317282573402, + 348.6009410610182 + ], + [ + 181.4317282573402, + 360.5958121512898 + ], + [ + 68.97404545320371, + 360.5958121512898 + ] + ] + }, + { + "title": "5.2.3. How to Achieve More Effective RL?", + "heading_level": null, + "page_id": 20, + "polygon": [ + [ + 68.97404545320371, + 538.2698401759378 + ], + [ + 274.39674604209307, + 538.2698401759378 + ], + [ + 274.39674604209307, + 548.7653523799255 + ], + [ + 68.97404545320371, + 548.7653523799255 + ] + ] + }, + { + "title": "6. Conclusion, Limitation, and Future Work", + "heading_level": null, + "page_id": 21, + "polygon": [ + [ + 68.97404545320371, + 422.589323759079 + ], + [ + 331.37530532952223, + 422.589323759079 + ], + [ + 331.37530532952223, + 434.8140770223453 + ], + [ + 68.97404545320371, + 435.5637564654873 + ] + ] + }, + { + "title": "References", + "heading_level": null, + "page_id": 22, + "polygon": [ + [ + 69.46824425458908, + 85.50445461273193 + ], + [ + 137.19837302104654, + 85.50445461273193 + ], + [ + 137.19837302104654, + 96.70864816531473 + ], + [ + 69.46824425458908, + 96.70864816531473 + ] + ] + }, + { + "title": "A. Appendix", + "heading_level": null, + "page_id": 27, + "polygon": [ + [ + 68.97404545320371, + 85.91553372144699 + ], + [ + 150.69329495754292, + 85.91553372144699 + ], + [ + 150.69329495754292, + 98.24790698289871 + ], + [ + 68.97404545320371, + 98.24790698289871 + ] + ] + }, + { + "title": "A.1. Analysis of Reinforcement Learning", + "heading_level": null, + "page_id": 27, + "polygon": [ + [ + 68.97404545320371, + 113.04675489664078 + ], + [ + 277.3956175835367, + 113.04675489664078 + ], + [ + 277.3956175835367, + 123.73481172323227 + ], + [ + 68.97404545320371, + 123.73481172323227 + ] + ] + }, + { + "title": "A.1.1. 
Supervised Fine-tuning", + "heading_level": null, + "page_id": 27, + "polygon": [ + [ + 68.2243275678428, + 192.7961019873619 + ], + [ + 215.9187509839421, + 192.7961019873619 + ], + [ + 215.9187509839421, + 203.4841588139534 + ], + [ + 68.2243275678428, + 203.4841588139534 + ] + ] + }, + { + "title": "A.1.2. Rejection Sampling Fine-tuning", + "heading_level": null, + "page_id": 27, + "polygon": [ + [ + 68.2243275678428, + 390.93623238801956 + ], + [ + 257.90295256415305, + 390.93623238801956 + ], + [ + 257.90295256415305, + 402.5778609672404 + ], + [ + 68.2243275678428, + 402.5778609672404 + ] + ] + }, + { + "title": "A.1.3. Online Rejection Sampling Fine-tuning", + "heading_level": null, + "page_id": 27, + "polygon": [ + [ + 68.2243275678428, + 658.9682305217957 + ], + [ + 293.8894110614767, + 658.9682305217957 + ], + [ + 293.8894110614767, + 669.6478680968285 + ], + [ + 68.2243275678428, + 669.6478680968285 + ] + ] + }, + { + "title": "A.1.4. Direct Preference Optimization (DPO)", + "heading_level": null, + "page_id": 28, + "polygon": [ + [ + 68.2243275678428, + 86.7376919388771 + ], + [ + 290.1408216346722, + 86.7376919388771 + ], + [ + 290.1408216346722, + 97.4583276084567 + ], + [ + 68.2243275678428, + 97.4583276084567 + ] + ] + }, + { + "title": "A.1.5. Proximal Policy Optimization (PPO)", + "heading_level": null, + "page_id": 28, + "polygon": [ + [ + 68.97404545320371, + 340.3544671864565 + ], + [ + 285.6425143225067, + 340.3544671864565 + ], + [ + 285.6425143225067, + 351.5996588335861 + ], + [ + 68.97404545320371, + 351.5996588335861 + ] + ] + }, + { + "title": "A.1.6. 
Group Relative Policy Optimization (GRPO)", + "heading_level": null, + "page_id": 28, + "polygon": [ + [ + 68.97404545320371, + 673.9618193846352 + ], + [ + 325.37756224663497, + 673.9618193846352 + ], + [ + 325.37756224663497, + 685.2070110317649 + ], + [ + 68.97404545320371, + 685.2070110317649 + ] + ] + } + ], + "page_stats": [ + { + "page_id": 0, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 55 + ], + [ + "Span", + 33 + ], + [ + "Text", + 4 + ], + [ + "SectionHeader", + 2 + ], + [ + "Footnote", + 2 + ], + [ + "PageHeader", + 1 + ], + [ + "Picture", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + "llm_tokens_used": 10945, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 1, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 52 + ], + [ + "Span", + 29 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 2, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 62 + ], + [ + "Span", + 21 + ], + [ + "ListItem", + 8 + ], + [ + "SectionHeader", + 4 + ], + [ + "Text", + 3 + ], + [ + "ListGroup", + 2 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 3, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 55 + ], + [ + "Span", + 13 + ], + [ + "Text", + 4 + ], + [ + "ListItem", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": 
{ + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 4, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 46 + ], + [ + "Span", + 5 + ], + [ + "Text", + 3 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 5, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 131 + ], + [ + "Line", + 45 + ], + [ + "Span", + 22 + ], + [ + "Text", + 4 + ], + [ + "SectionHeader", + 3 + ], + [ + "ListItem", + 3 + ], + [ + "Table", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 3027, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 6, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 78 + ], + [ + "Span", + 7 + ], + [ + "Text", + 2 + ], + [ + "ListItem", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 7, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 184 + ], + [ + "Line", + 34 + ], + [ + "Span", + 6 + ], + [ + "Text", + 4 + ], + [ + "Table", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + 
"llm_error_count": 0, + "llm_tokens_used": 3378, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 8, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 141 + ], + [ + "Line", + 32 + ], + [ + "Span", + 14 + ], + [ + "Text", + 3 + ], + [ + "Table", + 2 + ], + [ + "Caption", + 2 + ], + [ + "TableGroup", + 2 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 3998, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 9, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 58 + ], + [ + "Span", + 16 + ], + [ + "Text", + 4 + ], + [ + "ListItem", + 4 + ], + [ + "Footnote", + 4 + ], + [ + "SectionHeader", + 3 + ], + [ + "ListGroup", + 2 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 10, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 52 + ], + [ + "Span", + 25 + ], + [ + "Text", + 6 + ], + [ + "SectionHeader", + 3 + ], + [ + "Equation", + 1 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1243, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 11, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 372 + ], + [ + "Line", + 8 + ], + [ + "Span", + 2 + ], + [ + "Table", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 6341, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 12, + "text_extraction_method": 
"surya", + "block_counts": [ + [ + "Line", + 72 + ], + [ + "Span", + 30 + ], + [ + "Text", + 4 + ], + [ + "Equation", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 3334, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 13, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Span", + 117 + ], + [ + "Line", + 56 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 3 + ], + [ + "Code", + 1 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1033, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 14, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 52 + ], + [ + "Span", + 12 + ], + [ + "Text", + 6 + ], + [ + "SectionHeader", + 5 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 15, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 159 + ], + [ + "Line", + 42 + ], + [ + "Span", + 13 + ], + [ + "Text", + 4 + ], + [ + "ListItem", + 4 + ], + [ + "SectionHeader", + 2 + ], + [ + "ListGroup", + 2 + ], + [ + "Table", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 3309, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 16, + "text_extraction_method": "surya", + "block_counts": [ + [ + "TableCell", + 299 + ], + [ + "Line", + 25 + ], + [ + "Span", + 7 + ], + [ + "Table", + 3 + ], + [ + "Text", + 3 + 
], + [ + "Caption", + 2 + ], + [ + "TableGroup", + 2 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 3, + "llm_error_count": 0, + "llm_tokens_used": 7700, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 17, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 54 + ], + [ + "Span", + 35 + ], + [ + "Text", + 7 + ], + [ + "ListItem", + 5 + ], + [ + "SectionHeader", + 2 + ], + [ + "ListGroup", + 2 + ], + [ + "Equation", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1231, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 18, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 51 + ], + [ + "TableCell", + 28 + ], + [ + "Span", + 22 + ], + [ + "ListItem", + 4 + ], + [ + "Caption", + 2 + ], + [ + "Text", + 2 + ], + [ + "Table", + 1 + ], + [ + "Figure", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "TableGroup", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 1, + "llm_error_count": 0, + "llm_tokens_used": 1727, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 19, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 61 + ], + [ + "Span", + 7 + ], + [ + "Text", + 4 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 20, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 67 + ], + [ + "Span", + 12 + ], + [ + "Text", + 3 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + 
"Caption", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 21, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 50 + ], + [ + "Span", + 12 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", + 1 + ], + [ + "Footnote", + 1 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 22, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 56 + ], + [ + "Span", + 13 + ], + [ + "ListItem", + 11 + ], + [ + "SectionHeader", + 1 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 23, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 55 + ], + [ + "Span", + 15 + ], + [ + "ListItem", + 14 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 24, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 56 + ], + [ + "Span", + 13 + ], + [ + "ListItem", + 12 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 25, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 56 + ], + [ + "Span", + 14 + ], + [ + 
"ListItem", + 13 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 26, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 46 + ], + [ + "Span", + 12 + ], + [ + "ListItem", + 11 + ], + [ + "PageFooter", + 1 + ], + [ + "ListGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 27, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 47 + ], + [ + "Span", + 22 + ], + [ + "Text", + 8 + ], + [ + "Equation", + 6 + ], + [ + "SectionHeader", + 5 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 6, + "llm_error_count": 0, + "llm_tokens_used": 6067, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 28, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 47 + ], + [ + "Span", + 27 + ], + [ + "Text", + 9 + ], + [ + "Equation", + 8 + ], + [ + "SectionHeader", + 3 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 8, + "llm_error_count": 0, + "llm_tokens_used": 8881, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + }, + { + "page_id": 29, + "text_extraction_method": "surya", + "block_counts": [ + [ + "Line", + 13 + ], + [ + "Span", + 8 + ], + [ + "Text", + 3 + ], + [ + "Equation", + 2 + ], + [ + "PageFooter", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 2, + "llm_error_count": 0, + "llm_tokens_used": 2213, + "previous_text": "", + "previous_type": "", + "previous_order": 0 + } + } + ], + "debug_data_path": "debug_data/arXiv:2402.03300" +} \ No newline at end of file diff --git a/research/papers/dapo.pdf 
b/research/papers/dapo.pdf new file mode 100644 index 0000000..d2dca93 Binary files /dev/null and b/research/papers/dapo.pdf differ diff --git a/research/papers/rl_zvp.pdf b/research/papers/rl_zvp.pdf new file mode 100644 index 0000000..aa5204a Binary files /dev/null and b/research/papers/rl_zvp.pdf differ diff --git a/research/papers/scalerl.pdf b/research/papers/scalerl.pdf new file mode 100644 index 0000000..127db52 Binary files /dev/null and b/research/papers/scalerl.pdf differ diff --git a/research/papers/tricks_or_traps_part1.pdf b/research/papers/tricks_or_traps_part1.pdf new file mode 100644 index 0000000..855afd1 Binary files /dev/null and b/research/papers/tricks_or_traps_part1.pdf differ diff --git a/research/papers/vanilla_grpo.pdf b/research/papers/vanilla_grpo.pdf new file mode 100644 index 0000000..247efad Binary files /dev/null and b/research/papers/vanilla_grpo.pdf differ diff --git a/research/plan.md b/research/plan.md new file mode 100644 index 0000000..25e9c4c --- /dev/null +++ b/research/plan.md @@ -0,0 +1,82 @@ +# ScaleRL, RL-ZVP, and DAPO Implementation Plan + +This document outlines the plan to integrate features from **ScaleRL**, **RL-ZVP**, and **DAPO** into the `ludic` repository, adhering to the existing modular architecture. + +## 1. Credit Assignment (`src/ludic/training/credit_assignment.py`) + +**Goal:** Support new advantage estimation techniques without changing the core rollout loop. + +### New Components: + +* **`BatchNormalizedReturn` (ScaleRL)** + * **Logic:** Computes group-centered advantages ($A_i = R_i - \mu_{group}$) but normalizes them using the **batch-wide** standard deviation ($\sigma_{batch}$) instead of the group standard deviation. This is preferred by ScaleRL for more robust normalization when reward distributions are highly concentrated. + * **Implementation:** A new `CreditAssigner` dataclass. It will collect all centered advantages in the batch first, compute $\sigma_{batch}$, and then assign final weights. 
+
+* **`ZVPGroupNormalizedReturn` (RL-ZVP)**
+    * **Logic:** Detects "Zero-Variance Prompts" (ZVP) where $\sigma_{group} \approx 0$.
+    * **Action:**
+        * If ZVP: Marks the rollout metadata with `zvp_direction` (+1 if all correct, -1 if all wrong) and sets the scalar weight to `0.0`. This signals the ZVP-aware loss function to switch modes for these samples.
+        * If Non-ZVP: Falls back to standard `GroupNormalizedReturn` logic.
+    * **Implementation:** A new `CreditAssigner` dataclass that injects this metadata.
+
+## 2. Loss Functions (`src/ludic/training/loss.py`)
+
+**Goal:** Integrate entropy scaling and specific aggregation methods into the loss calculation.
+
+### New Components:
+
+* **`ZVPCISPOLoss` (RL-ZVP)**
+    * **Logic:** Extends `CISPOLoss`.
+        * Checks `batch["meta"]["zvp_direction"]` (or similar metadata passed via `SAWItem`).
+        * **Non-ZVP:** Uses the standard CISPO update.
+        * **ZVP:** Computes the entropy-guided gradient:
+            * $Weight = \alpha \cdot H(x)$ (for positive ZVP).
+            * $Weight = -\alpha \cdot (\max(H) - H(x))$ (for negative ZVP).
+    * **Implementation:** A new dataclass extending `CISPOLoss` that accepts an `alpha_zvp` parameter and branches based on the presence of this metadata.
+
+* **`ScaleRLLoss` (ScaleRL)**
+    * **Logic:** Enforces **prompt-level** loss aggregation.
+    * **Implementation:** This is likely a specific configuration of `CISPOLoss` or `ReinforceLoss` that correctly handles normalization by `group_size` and `prompt_length`. We will verify whether a new class is needed or whether the existing `length_normalize` flags are sufficient.
+
+* **DAPO Clipping**
+    * **Status:** `TokenClippedSurrogateLoss` already supports asymmetric `clip_eps_low` / `clip_eps_high`. We will ensure the presets in `algorithm.py` expose these correctly.
+
+## 3. Data Curriculum & Sampling (`src/ludic/training/batching/`)
+
+**Goal:** Filter prompts based on historical performance.
+
+### New Components:
+
+* **`NoPositiveResampling` (ScaleRL)**
+    * **Logic:** "If a prompt is solved (pass rate > 0.9), never show it again."
+    * **Implementation:**
+        * We will implement this in `src/ludic/training/batching/requests_from_dataset.py`.
+        * Create a `HistoryFilter` wrapper or similar mechanism that maintains a persistent map of `{prompt_hash: pass_rate}`.
+        * It will filter items from the dataset queue before they are turned into `RolloutRequest`s.
+
+* **Dynamic Sampling (DAPO)**
+    * **Logic:** "If a batch contains ZVPs, drop them and sample *more* prompts to fill the batch."
+    * **Implementation:** True dynamic *resampling* is complex in the decoupled `PipelineRL` architecture. We will implement the **offline** variant (zero-variance filtering) first, as used in ScaleRL. This acts as a `SampleFilter` in `RolloutEngine.generate_batch` that drops items with weight = 0 (ZVP) before the update.
+
+## 4. Algorithm Presets (`src/ludic/training/algorithm.py`)
+
+**Goal:** Provide user-friendly entry points for the new recipes.
+
+### New Factories:
+
+* **`make_scalerl(...)`**:
+    * Uses `BatchNormalizedReturn`.
+    * Uses `CISPOLoss` (with prompt-level aggregation).
+    * Configures `FP32` precision (via existing config or new args).
+    * Enables `NoPositiveResampling`.
+
+* **`make_rl_zvp(...)`**:
+    * Uses `ZVPGroupNormalizedReturn`.
+    * Uses `ZVPCISPOLoss`.
+
+## Summary of Work
+
+1. Implement `BatchNormalizedReturn` and `ZVPGroupNormalizedReturn` in `credit_assignment.py`.
+2. Implement `ZVPCISPOLoss` in `loss.py`.
+3. Implement `NoPositiveResampling` logic in `batching/`.
+4. Create `make_scalerl` and `make_rl_zvp` in `algorithm.py`.
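+
+As a concrete reference for Steps 1 and 2, the two weighting schemes can be sketched in isolation. This is a minimal pure-Python sketch under stated assumptions: the function names, the list-of-lists interface, and the `alpha` default are illustrative, not the eventual `CreditAssigner` / `ZVPCISPOLoss` APIs.
+
+```python
+import statistics
+
+
+def batch_normalized_advantages(
+    rewards_per_group: list[list[float]], eps: float = 1e-6
+) -> list[list[float]]:
+    """ScaleRL-style: center rewards per group, normalize by the batch-wide std."""
+    # A_i = R_i - mu_group (per-group centering)
+    centered = [
+        [r - statistics.fmean(group) for r in group] for group in rewards_per_group
+    ]
+    # One sigma_batch over all centered advantages, instead of a per-group sigma
+    sigma_batch = statistics.pstdev([a for group in centered for a in group])
+    return [[a / (sigma_batch + eps) for a in group] for group in centered]
+
+
+def zvp_token_weight(
+    entropy: float, max_entropy: float, direction: int, alpha: float = 0.1
+) -> float:
+    """RL-ZVP: entropy-guided weight for a token in a zero-variance group.
+
+    direction=+1 (all rollouts correct): reinforce, scaled by token entropy.
+    direction=-1 (all rollouts wrong): penalize, scaled by (max_H - H).
+    """
+    if direction > 0:
+        return alpha * entropy
+    return -alpha * (max_entropy - entropy)
+```
+
+Note that a zero-variance group contributes all-zero advantages under `batch_normalized_advantages`; that is exactly the case the `zvp_direction` metadata and the entropy-guided weights are meant to rescue.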
diff --git a/scripts/visualize_logs.py b/scripts/visualize_logs.py new file mode 100644 index 0000000..827a707 --- /dev/null +++ b/scripts/visualize_logs.py @@ -0,0 +1,793 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "streamlit", +# "pandas", +# "tiktoken", +# "plotly", +# ] +# /// + +import streamlit as st +import json +import pandas as pd +import glob +import os +import sys +import math +import tiktoken +import plotly.express as px +import plotly.graph_objects as go +from datetime import datetime +from streamlit.web import cli as stcli + +# --- Path Setup & Imports --- +current_dir = os.path.dirname(os.path.abspath(__file__)) +project_root = os.path.join(current_dir, "..", "src") +if project_root not in sys.path: + sys.path.append(project_root) + +try: + from ludic.envs.code_exec.adapters.apps import APPS_SYSTEM_PROMPT +except ImportError as e: + APPS_SYSTEM_PROMPT = f"Error loading system prompt from codebase: {e}" + +import re + +# --- Configuration & Tokenizer --- +st.set_page_config(layout="wide", page_title="Neural Observatory", page_icon="🔭") + + +@st.cache_resource +def get_encoder(): + return tiktoken.get_encoding("cl100k_base") + + +encoder = get_encoder() + +COMMON_ERRORS = [ + "SyntaxError", "IndentationError", "NameError", "TypeError", + "ValueError", "AttributeError", "IndexError", "KeyError", + "FileNotFoundError", "ImportError", "ModuleNotFoundError", + "TimeoutError", "AssertionError", "ZeroDivisionError", + "RuntimeError", "MemoryError", "RecursionError" +] + +def extract_primary_error(steps): + """Scans execution outputs for common Python errors.""" + if not steps: + return "No Steps" + + # Check the last step first as it's the most likely failure point + for step in reversed(steps): + out = step.get("next_obs", "") + if not out: continue + + for err in COMMON_ERRORS: + if err in out: + return err + if "Error:" in out: + return "Generic Error" + + return "None" + +def count_tokens(text: str) -> int: + if not text: + 
return 0 + return len(encoder.encode(text)) + + +# --- Aesthetic System: Neural Observatory --- +st.markdown( + """ + +""", + unsafe_allow_html=True, +) + +# --- Data Loading & Processing --- + + +@st.cache_data +def load_and_process_data(file_path): + data = [] + token_counts = [] + rewards = [] + difficulties = [] + step_counts = [] + error_types = [] + ids = [] + pass_stats = [] + + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + try: + item = json.loads(line) + # Pre-calculate token counts for efficiency + steps = item.get("steps", []) + code = steps[0].get("action", "") if steps else "" + t_count = count_tokens(code) + item["_token_count"] = t_count + + # Extract other metadata + diff = item.get("meta", {}).get("difficulty", "Unknown") + err = extract_primary_error(steps) + + # Extract Detailed Test Info + passed = 0 + total = 0 + pass_rate = 0.0 + compile_failed = False + exec_ms = 0.0 + + if steps: + last_step = steps[-1] + last_info = last_step.get("info", {}) + passed = last_info.get("passed", 0) + total = last_info.get("total", 0) + compile_failed = last_info.get("compile_failed", False) + # Use provided pass_rate or calculate it + pass_rate = last_info.get("pass_rate", 0.0) + if total > 0 and "pass_rate" not in last_info: + pass_rate = passed / total + + # Extract Timing + timing = last_step.get("timing", {}) + exec_ms = timing.get("total_execution_ms", 0.0) + + data.append(item) + token_counts.append(t_count) + rewards.append(item.get("total_reward", 0.0)) + difficulties.append(diff) + step_counts.append(len(steps)) + error_types.append(err) + ids.append(item.get("meta", {}).get("problem_id", "N/A")) + pass_stats.append({ + "passed": passed, + "total": total, + "pass_rate": pass_rate, + "compile_failed": compile_failed, + "exec_ms": exec_ms + }) + except json.JSONDecodeError: + continue + + # Calculate stats for relative visualization + df = pd.DataFrame({ + "id": ids, + "tokens": token_counts, + "reward": rewards, + "difficulty": 
difficulties, + "steps": step_counts, + "error_type": error_types, + "pass_rate": [p["pass_rate"] for p in pass_stats], + "compile_failed": [p["compile_failed"] for p in pass_stats], + "exec_ms": [p["exec_ms"] for p in pass_stats] + }) + + stats = { + "max_tokens": max(token_counts) if token_counts else 1, + "avg_tokens": sum(token_counts) / len(token_counts) if token_counts else 0, + "avg_steps": sum(step_counts) / len(step_counts) if step_counts else 0, + "success_rate": sum(1 for r in rewards if r == 1.0) / len(rewards) if rewards else 0, + "compile_fail_rate": sum(1 for p in pass_stats if p["compile_failed"]) / len(pass_stats) if pass_stats else 0 + } + + return data, stats, df + + +# --- View Components --- + + +def render_metric(label, value, subtext=None, color="primary"): + color_var = f"var(--color-{color})" + st.markdown( + f""" +
+        <div class="metric-card">
+            <div class="metric-label">{label}</div>
+            <div class="metric-value" style="color: {color_var}">{value}</div>
+            {f'<div class="metric-subtext">{subtext}</div>' if subtext else ""}
+        </div>
+ """, + unsafe_allow_html=True, + ) + + +def render_overview(data, stats, df): + st.markdown("## 📡 Mission Control") + + # Top Level Metrics + c1, c2, c3, c4, c5 = st.columns(5) + with c1: + render_metric("Total Rollouts", len(data), "Active Datastreams") + with c2: + render_metric("Success Rate", f"{stats['success_rate']*100:.1f}%", "Mission Completion", + color="success" if stats['success_rate'] > 0.5 else "failure") + with c3: + render_metric("Compile Failures", f"{stats['compile_fail_rate']*100:.1f}%", "Syntax/Build Errors", color="failure") + with c4: + render_metric("Avg. Latency", f"{df['exec_ms'].mean()/1000:.1f}s", "Inference + Exec") + with c5: + render_metric("Avg. Tokens", f"{int(stats['avg_tokens']):,}", "Computational Load") + + st.markdown("### 📊 Telemetry Analysis") + + # Row 1: Difficulty & Error Analysis + col_charts_1, col_charts_2, col_charts_3 = st.columns(3) + + with col_charts_1: + # Success Rate by Difficulty + if "difficulty" in df.columns: + diff_stats = df.groupby("difficulty")["reward"].mean().reset_index() + diff_stats["success_pct"] = diff_stats["reward"] * 100 + + fig_diff = px.bar( + diff_stats, x="difficulty", y="success_pct", + title="Success Rate by Difficulty (%)", + color="success_pct", + color_continuous_scale=["#ff0055", "#00ff9d"], + template="plotly_dark", + labels={"success_pct": "Success Rate (%)", "difficulty": "Difficulty Level"} + ) + fig_diff.update_layout(paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)") + fig_diff.update_yaxes(range=[0, 100]) + st.plotly_chart(fig_diff, width="stretch") + else: + st.info("No difficulty metadata found.") + + with col_charts_2: + # Pass Rate Distribution (Partial Credit) + fig_pass = px.histogram( + df, x="pass_rate", nbins=20, + title="Test Pass Rate Distribution", + color_discrete_sequence=['#00f0ff'], + template="plotly_dark", + labels={"pass_rate": "Pass Rate (0.0 - 1.0)"} + ) + fig_pass.update_layout(paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)") + 
st.plotly_chart(fig_pass, width="stretch") + + with col_charts_3: + # Latency Distribution (Box Plot) + fig_lat = px.box( + df, y="exec_ms", + title="Execution Latency (ms)", + template="plotly_dark", + color_discrete_sequence=['#7000ff'] + ) + fig_lat.update_layout(paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)") + st.plotly_chart(fig_lat, width="stretch") + + # Row 2: Deep Dive Data Table + st.markdown("### 📋 Data Matrix") + st.dataframe( + df, + column_config={ + "reward": st.column_config.ProgressColumn( + "Reward", + help="Total Reward (0.0 to 1.0)", + min_value=0.0, + max_value=1.0, + format="%.2f", + ), + "pass_rate": st.column_config.ProgressColumn( + "Pass Rate", + help="Fraction of tests passed", + min_value=0.0, + max_value=1.0, + format="%.2f", + ), + "exec_ms": st.column_config.NumberColumn("Latency (ms)", format="%d"), + "tokens": st.column_config.NumberColumn("Tokens", format="%d"), + "compile_failed": st.column_config.CheckboxColumn("Compile Fail"), + }, + width="stretch", + hide_index=True, + height=300 + ) + + +def render_grid_view(rollouts, global_stats): + st.markdown(f"## 🔭 Observed Events ({len(rollouts)})") + + # Pagination + items_per_page = 24 + if "page" not in st.session_state: + st.session_state.page = 1 + total_pages = math.ceil(len(rollouts) / items_per_page) + + # Controls + c_ctrl_1, c_ctrl_2 = st.columns([6, 1]) + with c_ctrl_2: + c_prev, c_page, c_next = st.columns([1, 2, 1]) + if c_prev.button("◀", width="stretch") and st.session_state.page > 1: + st.session_state.page -= 1 + st.rerun() + if c_next.button("▶", width="stretch") and st.session_state.page < total_pages: + st.session_state.page += 1 + st.rerun() + c_page.caption(f"Pg {st.session_state.page}/{total_pages}") + + # Render Grid + start_idx = (st.session_state.page - 1) * items_per_page + batch = rollouts[start_idx : start_idx + items_per_page] + + cols = st.columns(3) + for i, item in enumerate(batch): + col = cols[i % 3] + with col: + # Prepare Data + meta = 
item.get("meta", {}) + pid = meta.get("problem_id", "Unknown") + diff = meta.get("difficulty", "N/A") + reward = item.get("total_reward", 0.0) + token_count = item.get("_token_count", 0) + + steps = item.get("steps", []) + code = steps[0].get("action", "") if steps else "" + preview_code = ( + code[:150] + "..." if len(code) > 150 else code or "// No Code Action" + ) + + # Status styling + is_pass = reward == 1.0 + status_class = "status-pass" if is_pass else "status-fail" + status_label = "SUCCESS" if is_pass else "FAILURE" + border_color = ( + "var(--color-success)" if is_pass else "rgba(255, 255, 255, 0.1)" + ) + if not is_pass: + border_color = "rgba(255, 0, 85, 0.3)" + + # Token Meter + token_pct = min(100, (token_count / global_stats["max_tokens"]) * 100) + + # Card HTML + card_html = f""" +
+            <div class="rollout-card" style="border: 1px solid {border_color};">
+                <div class="card-header">
+                    <span class="card-id">{pid}</span>
+                    <span class="card-difficulty">{diff}</span>
+                </div>
+                <div class="card-status">
+                    <span class="status-dot {status_class}"></span>
+                    <span class="{status_class}">{status_label}</span>
+                </div>
+                <pre class="card-code">{preview_code}</pre>
+                <div class="card-footer">
+                    <span>TOKEN USAGE</span>
+                    <span>{token_count:,} T</span>
+                </div>
+                <div class="token-meter">
+                    <div class="token-fill" style="width: {token_pct}%;"></div>
+                </div>
+            </div>
+ """ + st.markdown(card_html, unsafe_allow_html=True) + + # Invisible button overlay for interaction + if st.button(f"Analyze {pid}", key=f"btn_{item['id']}", width="stretch"): + st.session_state.selected_rollout = item + st.rerun() + + +def render_detail_view(): + item = st.session_state.selected_rollout + meta = item.get("meta", {}) + steps = item.get("steps", []) + + # Navigation + if st.button("← Return to Observatory"): + st.session_state.selected_rollout = None + st.rerun() + + # Header + st.markdown(f"# 🧬 Analysis: {meta.get('problem_id', 'Unknown')}") + + # KPI Strip + k1, k2, k3, k4 = st.columns(4) + with k1: + render_metric( + "Reward Signal", + item.get("total_reward"), + color="success" if item.get("total_reward") == 1 else "failure", + ) + with k2: + render_metric("Total Tokens", f"{item.get('_token_count', 0):,}", "Cost Factor") + with k3: + render_metric("Interaction Depth", len(steps), "Steps taken") + with k4: + render_metric("Difficulty", meta.get("difficulty", "N/A")) + + st.markdown("---") + + # Layout: System Prompt | Interaction Timeline + col_sys, col_main = st.columns([1, 3]) + + with col_sys: + with st.expander("🖥️ System Directive", expanded=False): + st.markdown("*Initial System Instructions provided to the agent.*") + st.code(APPS_SYSTEM_PROMPT, language="text") + + st.markdown("### Metadata") + st.json(meta) + + # Test Results Inspection + if steps: + last_info = steps[-1].get("info", {}) + + if last_info.get("compile_failed"): + st.error("🚨 Compilation Failed") + + if "test_results" in last_info: + st.markdown("### 🧪 Test Results") + test_results = last_info["test_results"] + if test_results: + tdf = pd.DataFrame(test_results) + # Select relevant columns if they exist + cols = [c for c in ["test_id", "passed", "failure_reason", "run_duration_ms"] if c in tdf.columns] + st.dataframe( + tdf[cols], + width="stretch", + column_config={ + "passed": st.column_config.CheckboxColumn("Pass"), + "run_duration_ms": 
st.column_config.NumberColumn("Time (ms)", format="%.1f") + } + ) + + if "passed" in last_info: + p = last_info["passed"] + t = last_info["total"] + st.metric("Tests Passed", f"{p}/{t}", f"{(p/t*100) if t else 0:.1f}% Rate") + + with col_main: + st.markdown("## 🎞️ Interaction Reconstruction") + + for i, step in enumerate(steps): + # Step Container + st.markdown( + f""" +
+                <div class="step-header">
+                    <span class="step-badge">Step {i + 1}</span>
+                </div>
+ """, + unsafe_allow_html=True, + ) + + # 1. Observation/Prompt + with st.container(): + prev_obs = step.get("prev_obs", "") + obs_tokens = count_tokens(prev_obs) + st.markdown( + f"**📡 Input / Observation** ({obs_tokens} tokens)", + unsafe_allow_html=True, + ) + if len(prev_obs) > 300: + with st.expander(f"Show Input ({obs_tokens} tokens)"): + st.code(prev_obs, language="text") + else: + st.code(prev_obs, language="text") + + # 2. Action (Code) + with st.container(): + action = step.get("action", "") + toks = count_tokens(action) + st.markdown( + f"**⚡ Agent Action** ({toks} tokens)", + unsafe_allow_html=True, + ) + st.code(action, language="python", line_numbers=True) + + # 3. Output + with st.container(): + out = step.get("next_obs") + if out: + st.markdown("**⚙️ Execution Output**") + if len(out) > 1000: + st.code( + out[:300] + "\n... [TRUNCATED] ...\n" + out[-300:], + language="text", + ) + with st.expander("View Full Output"): + st.code(out, language="text") + else: + st.code(out, language="text") + else: + st.info("No output recorded for this step.") + + st.markdown("
", unsafe_allow_html=True) + + +# --- Main App Logic --- + + +def main(): + # Sidebar + with st.sidebar: + st.markdown("### 🔭 Neural Observatory") + + # File Selector + search_paths = ["*.jsonl", "data/*.jsonl", "logs/**/*.jsonl", "**/*.jsonl"] + files = [] + for p in search_paths: + files.extend(glob.glob(p, recursive=True)) + files = sorted(list(set(files)), key=os.path.getmtime, reverse=True) + + selected_file = st.selectbox("Select Datastream", files) if files else None + + if not selected_file: + st.warning("No data found.") + return + + # Load Data + with st.spinner("Decoding telemetry..."): + data, stats, df = load_and_process_data(selected_file) + + st.markdown("---") + st.markdown("### 🔍 Filters") + + # View Mode + view_mode = st.radio("View Mode", ["Dashboard", "Grid Inspection"]) + + # Filters + diffs = ["All"] + sorted( + list(set(d.get("meta", {}).get("difficulty", "unknown") for d in data)) + ) + f_diff = st.selectbox("Difficulty Class", diffs) + + # Error Filter + all_errors = ["All"] + sorted(list(set(df["error_type"].unique()))) + f_error = st.selectbox("Error Type", all_errors) + + f_outcome = st.radio("Outcome State", ["All", "Success", "Failure"]) + + f_search = st.text_input("Search (ID or Content)") + deep_search = st.checkbox("Deep Search (Scan Code/Output)", value=False) + + # Filter Logic + filtered = data + if f_diff != "All": + filtered = [ + d for d in filtered if d.get("meta", {}).get("difficulty") == f_diff + ] + if f_error != "All": + # Match against the pre-calculated error type in df, + # but we need to map back to the list of dicts. 
+ # Efficient way: filter ids based on df filter + target_ids = set(df[df["error_type"] == f_error]["id"]) + filtered = [d for d in filtered if d.get("meta", {}).get("problem_id") in target_ids] + + if f_outcome == "Success": + filtered = [d for d in filtered if d.get("total_reward") == 1.0] + elif f_outcome == "Failure": + filtered = [d for d in filtered if d.get("total_reward") != 1.0] + + if f_search: + term = f_search.lower() + if deep_search: + # Heavy scan + new_filtered = [] + for d in filtered: + # Check ID + if term in str(d.get("meta", {}).get("problem_id", "")).lower(): + new_filtered.append(d) + continue + # Check content + found = False + for step in d.get("steps", []): + if term in step.get("action", "").lower() or term in step.get("next_obs", "").lower() or term in step.get("prev_obs", "").lower(): + found = True + break + if found: + new_filtered.append(d) + filtered = new_filtered + else: + # Light scan (ID/Metadata only) + filtered = [d for d in filtered if term in str(d).lower()] + + st.markdown("---") + st.caption(f"v2.1.0 | {len(filtered)} records active") + + # Main Router + if st.session_state.get("selected_rollout"): + render_detail_view() + elif view_mode == "Dashboard": + render_overview(filtered, stats, df) + else: + render_grid_view(filtered, stats) + + +if __name__ == "__main__": + if st.runtime.exists(): + main() + else: + sys.argv = ["streamlit", "run", sys.argv[0]] + sys.exit(stcli.main()) diff --git a/src/ludic/envs/code_exec/README.md b/src/ludic/envs/code_exec/README.md new file mode 100644 index 0000000..0c4453c --- /dev/null +++ b/src/ludic/envs/code_exec/README.md @@ -0,0 +1,263 @@ +# CodeExecEnv Module + +A sandboxed code execution environment for reinforcement learning on code generation tasks. + +## Module Structure + +``` +code_exec/ +├── __init__.py # Public API exports +├── types.py # Data types (TestCase, TestResult, BatchTestResult, etc.) 
+├── sandbox.py # Sandbox/SandboxPool protocols +├── docker_sandbox.py # Docker-based sandbox implementation + LRU cache +├── runners.py # Code execution strategies (StdinStdoutRunner) +├── env.py # CodeExecEnv (main RL environment) +└── adapters/ + ├── base.py # TestAdapter, OutputVerifier protocols + └── apps.py # APPS dataset adapter +``` + +## Core Abstractions + +### Sandbox Protocol + +```python +class Sandbox(Protocol): + """Single sandboxed execution environment.""" + + async def execute( + self, + code: str, + stdin: str = "", + timeout_s: float = 5.0, + ) -> ExecutionResult: + """Execute code and return result.""" + ... +``` + +### SandboxPool Protocol + +```python +class SandboxPool(Protocol): + """Pool of reusable sandboxes with caching.""" + + async def checkout(self, timeout_s: float = 30.0) -> Sandbox: + """Get a sandbox from the pool.""" + ... + + async def release(self, sandbox: Sandbox) -> None: + """Return sandbox to pool.""" + ... + + def cache_get(self, code_hash: str, tests_hash: str) -> BatchTestResult | None: + """Check cache for previous results.""" + ... + + def cache_put(self, code_hash: str, tests_hash: str, result: BatchTestResult) -> None: + """Store result in cache.""" + ... +``` + +### TestAdapter Protocol + +```python +class TestAdapter(Protocol): + """Extracts test cases from dataset samples.""" + + def extract_tests(self, sample: dict[str, Any]) -> list[TestCase]: + """Extract test cases from a sample.""" + ... + + def format_problem(self, sample: dict[str, Any]) -> str: + """Format problem description for prompt.""" + ... +``` + +### CodeRunner Protocol + +```python +class CodeRunner(Protocol): + """Executes code against test cases.""" + + async def run_tests( + self, + code: str, + tests: list[TestCase], + sandbox: Sandbox, + config: CodeExecConfig, + ) -> BatchTestResult: + """Run all tests and return results.""" + ... 
+``` + +## Usage + +### Basic Setup + +```python +from ludic.envs.code_exec import ( + CodeExecEnv, + CodeExecConfig, + DockerSandboxPool, + DockerSandboxConfig, +) +from ludic.envs.code_exec.adapters.apps import APPSTestAdapter + +# Create sandbox pool +pool_config = DockerSandboxConfig( + python_version="3.11", + memory_limit="256m", + cpu_quota=50000, + network_disabled=True, +) +pool = DockerSandboxPool(n_workers=4, config=pool_config) +await pool.start() + +# Create environment +env_config = CodeExecConfig( + timeout_per_test_s=5.0, + stop_on_first_failure=True, + partial_credit=False, +) +env = CodeExecEnv( + sample={"question": "...", "inputs": [...], "outputs": [...]}, + sandbox_pool=pool, + test_adapter=APPSTestAdapter(), + config=env_config, +) + +# Run episode +obs, info = await env.env_reset() +outcome = await env.env_step("print(input())") + +# Cleanup +await pool.shutdown() +``` + +### With SingleAgentProtocol + +The protocol automatically detects async environments: + +```python +from ludic.interaction import SingleAgentProtocol +from ludic.agent import Agent + +protocol = SingleAgentProtocol(agent=agent) +rollouts = await protocol.run(env=env, max_steps=3) +``` + +## Configuration + +### CodeExecConfig + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `timeout_per_test_s` | `float` | `5.0` | Timeout per test case | +| `stop_on_first_failure` | `bool` | `True` | Stop after first failed test | +| `compile_first` | `bool` | `True` | Check syntax before running | +| `partial_credit` | `bool` | `False` | Reward based on pass fraction | +| `compile_failure_reward` | `float` | `-0.1` | Reward for syntax errors | +| `timeout_reward` | `float` | `-0.05` | Reward for timeout | +| `use_cache` | `bool` | `True` | Enable result caching | + +### DockerSandboxConfig + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `python_version` | `str` | `"3.11"` | Python version in container | +| 
`memory_limit` | `str` | `"256m"` | Container memory limit |
+| `cpu_quota` | `int` | `50000` | CPU quota (50% of one core) |
+| `network_disabled` | `bool` | `True` | Disable container networking |
+
+## Implementing Custom Adapters
+
+Adapters implement the `TestAdapter` protocol: `get_tests`, `get_prompt`,
+`get_problem_id`, and `hash_tests`:
+
+```python
+import hashlib
+
+from ludic.envs.code_exec import TestAdapter, TestCase, ExactMatchVerifier
+
+class MyDatasetAdapter(TestAdapter):
+    def __init__(self):
+        self._verifier = ExactMatchVerifier(strip=True, case_sensitive=True)
+
+    def get_tests(self, sample: dict) -> list[TestCase]:
+        tests = []
+        for i, (inp, out) in enumerate(zip(sample["inputs"], sample["outputs"])):
+            tests.append(TestCase(input=inp, expected=out, id=f"test_{i}"))
+        return tests
+
+    def get_prompt(self, sample: dict) -> str:
+        return sample["problem_statement"]
+
+    def get_problem_id(self, sample: dict) -> str:
+        return str(sample.get("id", "unknown"))
+
+    def hash_tests(self, tests: list[TestCase]) -> str:
+        canonical = str([(t.input, t.expected) for t in tests])
+        return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:16]
+
+    @property
+    def verifier(self) -> ExactMatchVerifier:
+        return self._verifier
+```
+
+## Result Types
+
+### TestResult
+
+```python
+@dataclass
+class TestResult:
+    test_case: TestCase
+    passed: bool
+    actual: str | None
+    execution: ExecutionResult
+    error_message: str | None = None
+```
+
+### BatchTestResult
+
+```python
+@dataclass
+class BatchTestResult:
+    results: list[TestResult]
+    code_hash: str
+    tests_hash: str
+
+    @property
+    def passed_count(self) -> int: ...
+
+    @property
+    def total_count(self) -> int: ...
+
+    @property
+    def all_passed(self) -> bool: ...
+
+    @property
+    def pass_rate(self) -> float: ...
+``` + +## Caching + +The `DockerSandboxPool` includes an LRU cache to avoid re-executing identical code: + +```python +pool = DockerSandboxPool( + n_workers=4, + config=config, + cache_size=10000, # Max cached results +) + +# Check cache stats +print(pool.cache_stats) +# {'hits': 150, 'misses': 50, 'size': 200, 'max_size': 10000} +``` + +Cache keys are computed from: +- SHA256 hash of the code +- SHA256 hash of serialized test cases + +## Thread Safety + +- `LRUCache`: Thread-safe via `threading.Lock` +- `DockerSandboxPool`: Async-safe via `asyncio.Queue` +- `CodeExecEnv`: Not thread-safe (one instance per rollout) + +## Dependencies + +**Required:** +- `docker>=7.0.0` - Docker Python SDK + +**Optional (for specific adapters):** +- `datasets` - HuggingFace datasets for APPS diff --git a/src/ludic/envs/code_exec/__init__.py b/src/ludic/envs/code_exec/__init__.py new file mode 100644 index 0000000..2b4f816 --- /dev/null +++ b/src/ludic/envs/code_exec/__init__.py @@ -0,0 +1,126 @@ +""" +Code execution environment for RL on code generation tasks. + +This module provides: + - CodeExecEnv: Environment that executes code against test cases + - Sandbox protocols: Async sandboxed execution + - Test adapters: Dataset-specific test extraction + - Code runners: Execution strategies (stdin/stdout, function calls, etc.) 
+ - Backend selection: Auto-detection and manual selection of sandbox backends + +Supported backends: + - Docker (requires docker package + daemon): pip install docker>=7.0.0 + - Podman-HPC (HPC clusters): requires podman-hpc CLI + - Singularity (planned): not yet implemented + +Usage: + # Recommended: use the factory with auto-detection + from ludic.envs.code_exec import create_sandbox_pool + + pool = await create_sandbox_pool(n_workers=4) # Auto-detects backend + pool = await create_sandbox_pool(n_workers=4, backend="podman-hpc") # Explicit + + # Or import specific implementations + from ludic.envs.code_exec import DockerSandboxPool # Docker + from ludic.envs.code_exec import PodmanHPCSandboxPool # Podman-HPC +""" + +from __future__ import annotations + +from .types import ( + CompileStatus, + RunStatus, + CompileResult, + ExecutionResult, + TestCase, + TestResult, + BatchTestResult, +) +from .sandbox import Sandbox, SandboxPool +from .adapters.base import TestAdapter, OutputVerifier, ExactMatchVerifier +from .runners import CodeRunner, StdinStdoutRunner, compute_hash, hash_tests +from .env import CodeExecConfig, CodeExecEnv + +# Backend detection and factory (always available) +from .backend import ( + SandboxBackend, + detect_available_backend, + is_docker_available, + is_podman_hpc_available, + is_singularity_available, + get_backend_info, +) +from .factory import create_sandbox_pool + +# Docker-related imports are optional (requires docker package) +try: + from .docker_sandbox import ( + DockerSandboxConfig, + DockerSandbox, + DockerSandboxPool, + LRUCache, + ) + _DOCKER_AVAILABLE = True +except ImportError: + _DOCKER_AVAILABLE = False + DockerSandboxConfig = None # type: ignore[misc, assignment] + DockerSandbox = None # type: ignore[misc, assignment] + DockerSandboxPool = None # type: ignore[misc, assignment] + LRUCache = None # type: ignore[misc, assignment] + +# Podman-HPC imports (always available - uses subprocess, no external package) +from 
.podman_sandbox import ( + PodmanConfig, + PodmanHPCSandbox, + PodmanHPCSandboxPool, + PodmanError, +) + +__all__ = [ + # Types + "CompileStatus", + "RunStatus", + "CompileResult", + "ExecutionResult", + "TestCase", + "TestResult", + "BatchTestResult", + # Protocols + "Sandbox", + "SandboxPool", + "TestAdapter", + "OutputVerifier", + "CodeRunner", + # Implementations + "ExactMatchVerifier", + "StdinStdoutRunner", + # Environment + "CodeExecConfig", + "CodeExecEnv", + # Utilities + "compute_hash", + "hash_tests", + # Backend detection + "SandboxBackend", + "detect_available_backend", + "is_docker_available", + "is_podman_hpc_available", + "is_singularity_available", + "get_backend_info", + # Factory + "create_sandbox_pool", + # Podman-HPC (always available) + "PodmanConfig", + "PodmanHPCSandbox", + "PodmanHPCSandboxPool", + "PodmanError", +] + +# Add Docker-related exports only if available +if _DOCKER_AVAILABLE: + __all__.extend([ + "DockerSandboxConfig", + "DockerSandbox", + "DockerSandboxPool", + "LRUCache", + ]) diff --git a/src/ludic/envs/code_exec/adapters/__init__.py b/src/ludic/envs/code_exec/adapters/__init__.py new file mode 100644 index 0000000..f73237b --- /dev/null +++ b/src/ludic/envs/code_exec/adapters/__init__.py @@ -0,0 +1,19 @@ +""" +Dataset adapters for code execution environments. + +Each adapter knows how to extract test cases and prompts from a specific +dataset format (APPS, HumanEval, LeetCode, etc.). +""" + +from __future__ import annotations + +from .apps import APPS_SYSTEM_PROMPT, APPSTestAdapter +from .base import ExactMatchVerifier, OutputVerifier, TestAdapter + +__all__ = [ + "TestAdapter", + "OutputVerifier", + "ExactMatchVerifier", + "APPSTestAdapter", + "APPS_SYSTEM_PROMPT", +] diff --git a/src/ludic/envs/code_exec/adapters/apps.py b/src/ludic/envs/code_exec/adapters/apps.py new file mode 100644 index 0000000..cff6bb9 --- /dev/null +++ b/src/ludic/envs/code_exec/adapters/apps.py @@ -0,0 +1,144 @@ +""" +APPS dataset adapter. 
+ +Compatible with: + - codeparrot/apps + - RoganInglis/apps-control-arena + - Similar stdin/stdout format datasets +""" + +from __future__ import annotations + +import hashlib +from typing import Any, Dict, List + +from ..types import TestCase + + +APPS_SYSTEM_PROMPT = """You are an expert Python programmer solving competitive programming problems. + +Your solution will be tested against multiple test cases with different inputs. All tests must pass. + +CRITICAL REQUIREMENTS: +1. Read the problem specification carefully - understand input/output format, constraints, and edge cases +2. Write a complete, self-contained Python script +3. Read input using input() or sys.stdin +4. Print output using print() - match the exact format required +5. Your code must compile without errors and handle all test cases + +OUTPUT FORMAT (you MUST follow this exactly): + + +Brief analysis: +- Input/output format +- Key algorithm or approach +- Edge cases to handle + + + +```python +# Your complete solution here +``` + + +IMPORTANT: +- Keep concise - focus on problem understanding and approach +- Ensure your code compiles cleanly (no syntax errors) +- Match output format exactly (spacing, newlines, etc.) +- Test your logic within before writing +- Your solution will be executed against hidden test cases""" + + +class APPSTestAdapter: + """ + Test adapter for APPS-style datasets. + + Compatible with: + - codeparrot/apps + - RoganInglis/apps-control-arena + - Similar stdin/stdout datasets + + APPS format: + - question: problem description (string) + - inputs: list of stdin strings + - outputs: list of expected stdout strings + - problem_id: unique identifier + + Code is expected to be a Python script that reads from stdin + and writes to stdout. 
+ """ + + def __init__( + self, + *, + question_key: str = "question", + inputs_key: str = "inputs", + outputs_key: str = "outputs", + problem_id_key: str = "problem_id", + ) -> None: + """ + Args: + question_key: Key for problem description + inputs_key: Key for test inputs list + outputs_key: Key for expected outputs list + problem_id_key: Key for problem identifier + """ + self._question_key = question_key + self._inputs_key = inputs_key + self._outputs_key = outputs_key + self._problem_id_key = problem_id_key + + def get_prompt(self, sample: Dict[str, Any]) -> str: + """Extract problem description from sample.""" + return str(sample[self._question_key]) + + def get_problem_id(self, sample: Dict[str, Any]) -> str: + """Extract problem identifier from sample.""" + return str(sample.get(self._problem_id_key, "unknown")) + + def get_tests(self, sample: Dict[str, Any]) -> List[TestCase]: + """ + Extract test cases from sample. + + Args: + sample: Dataset sample with inputs and outputs lists + + Returns: + List of TestCase objects for stdin/stdout testing + + Raises: + ValueError: If inputs and outputs lists have different lengths + """ + inputs = sample[self._inputs_key] + outputs = sample[self._outputs_key] + + if len(inputs) != len(outputs): + raise ValueError( + f"Mismatched test case counts: {len(inputs)} inputs, " + f"{len(outputs)} outputs" + ) + + return [ + TestCase(input=inp, expected=out, id=f"test_{i}") + for i, (inp, out) in enumerate(zip(inputs, outputs)) + ] + + def hash_tests(self, tests: List[TestCase]) -> str: + """ + Compute stable hash of test cases for caching. 
+ + Args: + tests: List of test cases to hash + + Returns: + 16-character hex hash + """ + # Create canonical representation + canonical = [(t.input, t.expected) for t in tests] + canonical_str = str(canonical) + + # Hash with SHA256 + hash_obj = hashlib.sha256(canonical_str.encode("utf-8")) + + # Return first 16 hex characters + return hash_obj.hexdigest()[:16] diff --git a/src/ludic/envs/code_exec/adapters/base.py b/src/ludic/envs/code_exec/adapters/base.py new file mode 100644 index 0000000..99d63bf --- /dev/null +++ b/src/ludic/envs/code_exec/adapters/base.py @@ -0,0 +1,249 @@ +""" +Base protocols and default implementations for dataset adapters. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Protocol, Tuple, runtime_checkable + +from ..types import TestCase + + +@runtime_checkable +class TestAdapter(Protocol): + """ + Extracts test cases from a dataset sample. + + Each dataset format needs its own adapter to map from the sample + schema to the TestCase abstraction. This decouples the CodeExecEnv + from any specific dataset format. + + Implementations should be stateless and reusable across samples. + """ + + __test__ = False # Prevent pytest from collecting this as a test class + + def get_tests(self, sample: Dict[str, Any]) -> List[TestCase]: + """ + Extract test cases from a sample. + + Args: + sample: A single dataset sample (row) + + Returns: + List of TestCase objects ready for execution + """ + ... + + def get_prompt(self, sample: Dict[str, Any]) -> str: + """ + Extract the problem prompt/question from a sample. + + This is the text shown to the agent as the initial observation. + + Args: + sample: A single dataset sample (row) + + Returns: + The problem description string + """ + ... + + def get_problem_id(self, sample: Dict[str, Any]) -> str: + """ + Extract unique problem identifier. + + Used for logging, caching keys, and result tracking. 
+ + Args: + sample: A single dataset sample (row) + + Returns: + Unique identifier string + """ + ... + + def hash_tests(self, tests: List[TestCase]) -> str: + """ + Compute a stable hash of test cases for caching. + + The hash should be deterministic and capture all test inputs + and expected outputs. Used as part of the cache key. + + Args: + tests: List of test cases to hash + + Returns: + Hex string hash (typically 16 chars) + """ + ... + + +@runtime_checkable +class OutputVerifier(Protocol): + """ + Compares actual output against expected output. + + Separated from TestAdapter because the same comparison logic + (e.g., float tolerance, whitespace normalization) often applies + across different dataset formats. + """ + + def verify(self, actual: str, expected: str) -> Tuple[bool, Optional[str]]: + """ + Compare actual output against expected. + + Args: + actual: The actual output from code execution + expected: The expected output from the test case + + Returns: + Tuple of (passed, details) where: + - passed: True if outputs match + - details: Explanation of mismatch if not passed, None otherwise + """ + ... + + +class ExactMatchVerifier: + """ + Exact string match after stripping whitespace. + + This is the default verifier and works for most competitive + programming style problems (APPS, Codeforces, etc.). 
+ """ + + def __init__(self, *, strip: bool = True, case_sensitive: bool = True) -> None: + """ + Args: + strip: Whether to strip leading/trailing whitespace + case_sensitive: Whether comparison is case-sensitive + """ + self._strip = strip + self._case_sensitive = case_sensitive + + def verify(self, actual: str, expected: str) -> Tuple[bool, Optional[str]]: + """Compare actual vs expected with configured normalization.""" + a = actual.strip() if self._strip else actual + e = expected.strip() if self._strip else expected + + if not self._case_sensitive: + a = a.lower() + e = e.lower() + + if a == e: + return True, None + + # Provide useful diff info for debugging + details = self._generate_diff_details(a, e) + return False, details + + def _generate_diff_details(self, actual: str, expected: str) -> str: + """Generate a human-readable diff explanation.""" + # Length mismatch + if len(actual) != len(expected): + return ( + f"Length mismatch: got {len(actual)} chars, " + f"expected {len(expected)} chars" + ) + + # Find first difference + for i, (ca, ce) in enumerate(zip(actual, expected)): + if ca != ce: + # Show context around the difference + start = max(0, i - 10) + end = min(len(actual), i + 10) + actual_ctx = actual[start:end] + expected_ctx = expected[start:end] + return ( + f"First diff at position {i}: " + f"got {repr(ca)}, expected {repr(ce)}. " + f"Context: got '{actual_ctx}', expected '{expected_ctx}'" + ) + + return "Unknown difference (possibly trailing content)" + + +class WhitespaceNormalizedVerifier: + """ + Verifier that normalizes all whitespace before comparison. + + Useful for problems where output formatting (spaces, newlines) + may vary but content should be the same. 
+ """ + + def verify(self, actual: str, expected: str) -> Tuple[bool, Optional[str]]: + """Compare after normalizing all whitespace to single spaces.""" + a = " ".join(actual.split()) + e = " ".join(expected.split()) + + if a == e: + return True, None + + return False, f"Mismatch after whitespace normalization: got '{a[:100]}...', expected '{e[:100]}...'" + + +class FloatTolerantVerifier: + """ + Verifier that handles floating point comparisons with tolerance. + + Useful for numerical problems where small floating point differences + are acceptable. + """ + + def __init__( + self, + *, + abs_tol: float = 1e-9, + rel_tol: float = 1e-9, + strip: bool = True, + ) -> None: + """ + Args: + abs_tol: Absolute tolerance for float comparison + rel_tol: Relative tolerance for float comparison + strip: Whether to strip whitespace + """ + self._abs_tol = abs_tol + self._rel_tol = rel_tol + self._strip = strip + + def verify(self, actual: str, expected: str) -> Tuple[bool, Optional[str]]: + """ + Compare outputs, using float tolerance where applicable. + + Splits output into tokens and compares each. If both tokens + parse as floats, uses tolerance comparison. Otherwise uses + exact string match. 
+ """ + a = actual.strip() if self._strip else actual + e = expected.strip() if self._strip else expected + + a_tokens = a.split() + e_tokens = e.split() + + if len(a_tokens) != len(e_tokens): + return False, f"Token count mismatch: got {len(a_tokens)}, expected {len(e_tokens)}" + + for i, (at, et) in enumerate(zip(a_tokens, e_tokens)): + if not self._tokens_match(at, et): + return False, f"Mismatch at token {i}: got '{at}', expected '{et}'" + + return True, None + + def _tokens_match(self, actual: str, expected: str) -> bool: + """Check if two tokens match (with float tolerance if applicable).""" + # Try exact match first + if actual == expected: + return True + + # Try float comparison + try: + a_float = float(actual) + e_float = float(expected) + diff = abs(a_float - e_float) + threshold = max(self._abs_tol, self._rel_tol * abs(e_float)) + return diff <= threshold + except ValueError: + # Not floats, exact match already failed + return False diff --git a/src/ludic/envs/code_exec/backend.py b/src/ludic/envs/code_exec/backend.py new file mode 100644 index 0000000..c236b41 --- /dev/null +++ b/src/ludic/envs/code_exec/backend.py @@ -0,0 +1,171 @@ +""" +Sandbox backend detection and selection. 
+ +This module provides: + - SandboxBackend: Enumeration of supported sandbox backends + - detect_available_backend(): Auto-detection based on environment + - is_*_available(): Individual backend availability checks + +Auto-detection priority: + - In Slurm job: podman-hpc → docker → error + - Outside Slurm: docker → podman-hpc → error + +Usage: + from ludic.envs.code_exec.backend import detect_available_backend, SandboxBackend + + # Auto-detect + backend = detect_available_backend() + + # Manual selection + if backend == SandboxBackend.PODMAN_HPC: + from ludic.envs.code_exec.podman_sandbox import PodmanHPCSandboxPool + pool = PodmanHPCSandboxPool(n_workers=4) +""" + +from __future__ import annotations + +import os +import shutil +from enum import Enum + + +class SandboxBackend(str, Enum): + """Supported sandbox backends.""" + + DOCKER = "docker" + PODMAN_HPC = "podman-hpc" + SINGULARITY = "singularity" + AUTO = "auto" + + +def detect_available_backend() -> str: + """ + Auto-detect the best available sandbox backend. + + Detection priority: + - In Slurm job (SLURM_JOB_ID set): + 1. podman-hpc (most common on HPC) + 2. docker (some HPC clusters have Docker) + 3. error + - Outside Slurm: + 1. docker (most common for local development) + 2. podman-hpc + 3. 
error + + Returns: + Backend identifier (one of SandboxBackend values, excluding AUTO) + + Raises: + RuntimeError: If no sandbox backend is available + """ + in_slurm = os.environ.get("SLURM_JOB_ID") is not None + + if in_slurm: + # HPC environment: prefer podman-hpc + if is_podman_hpc_available(): + return SandboxBackend.PODMAN_HPC.value + if is_docker_available(): + return SandboxBackend.DOCKER.value + else: + # Local/cloud environment: prefer Docker + if is_docker_available(): + return SandboxBackend.DOCKER.value + if is_podman_hpc_available(): + return SandboxBackend.PODMAN_HPC.value + + # Singularity is deferred but check for future use + if is_singularity_available(): + # NOTE: Singularity backend not yet implemented + pass + + raise RuntimeError( + "No sandbox backend available. Install one of:\n" + " - Docker (daemon-based): pip install docker && start Docker daemon\n" + " - Podman-HPC (daemonless): available on HPC clusters with podman-hpc\n" + "\n" + "For HPC clusters, ensure you're running within a Slurm job:\n" + " srun --pty bash\n" + " # or\n" + " sbatch your_script.sh" + ) + + +def is_docker_available() -> bool: + """ + Check if Docker daemon is running and accessible. + + Returns: + True if Docker is available and responding + """ + try: + import docker + client = docker.from_env() + client.ping() + client.close() + return True + except ImportError: + # docker package not installed + return False + except Exception: + # Docker daemon not running or not accessible + return False + + +def is_podman_hpc_available() -> bool: + """ + Check if podman-hpc CLI is available. + + Note: This only checks if the command exists, not if containers + can actually be run (which may require being in a Slurm job). + + Returns: + True if podman-hpc command is in PATH + """ + return shutil.which("podman-hpc") is not None + + +def is_singularity_available() -> bool: + """ + Check if Singularity/Apptainer CLI is available. 
+ + Returns: + True if singularity or apptainer command is in PATH + """ + return ( + shutil.which("singularity") is not None + or shutil.which("apptainer") is not None + ) + + +def get_backend_info() -> dict: + """ + Get information about all backend availability. + + Useful for debugging and status reporting. + + Returns: + Dict with backend names as keys and availability info as values + """ + in_slurm = os.environ.get("SLURM_JOB_ID") is not None + + return { + "environment": { + "in_slurm": in_slurm, + "slurm_job_id": os.environ.get("SLURM_JOB_ID"), + }, + "backends": { + SandboxBackend.DOCKER.value: { + "available": is_docker_available(), + "requires": "Docker daemon + docker package", + }, + SandboxBackend.PODMAN_HPC.value: { + "available": is_podman_hpc_available(), + "requires": "podman-hpc command (HPC clusters)", + }, + SandboxBackend.SINGULARITY.value: { + "available": is_singularity_available(), + "requires": "singularity/apptainer command", + "note": "Not yet implemented", + }, + }, + } diff --git a/src/ludic/envs/code_exec/batch_runner.py b/src/ludic/envs/code_exec/batch_runner.py new file mode 100644 index 0000000..ae0d998 --- /dev/null +++ b/src/ludic/envs/code_exec/batch_runner.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +""" +Batch test runner for ludic code execution sandbox. + +This script runs inside the container. It: +1. Reads manifest.json for test configuration +2. Optionally compiles the solution using py_compile +3. Runs tests in PARALLEL using multiprocessing.Pool (default 16 workers) +4. 
Outputs streaming JSONL results (one JSON object per line, flushed immediately)
+
+Usage:
+    python batch_runner.py [manifest_path]
+
+The manifest.json format:
+    {
+        "code_file": "solution.py",
+        "compile_first": true,
+        "timeout_s": 5.0,
+        "stop_on_first_failure": true,
+        "num_workers": 16,
+        "tests": [
+            {"id": "test_0", "stdin": "5\\n", "expected": "25\\n"},
+            {"id": "test_1", "stdin": "3\\n", "expected": "9\\n"}
+        ]
+    }
+
+Output format (streaming JSONL):
+    {"type": "compile", "status": "success", "duration_ms": 12.5}
+    {"type": "test", "id": "test_0", "status": "success", "stdout": "25\\n", ...}
+    {"type": "test", "id": "test_1", "status": "timeout", ...}
+    {"type": "done", "total_tests": 2, "passed": 1, "failed": 1, "compile_failed": false}
+
+Status values:
+    compile: success, syntax_error, timeout
+    test: success, runtime_error, timeout, memory_exceeded, killed, not_run
+
+Note: This script is designed to be self-contained with no external dependencies
+beyond Python's standard library. It will be bundled into the container at runtime.
+"""
+
+from __future__ import annotations
+
+import json
+import multiprocessing
+import py_compile
+import subprocess
+import sys
+import time
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+
+def emit(obj: Dict[str, Any]) -> None:
+    """Write JSON line and flush immediately for streaming.
+
+    Each line must be a complete JSON object to enable partial result
+    recovery if the container crashes mid-execution.
+    """
+    print(json.dumps(obj), flush=True)
+
+
+def compile_check(code_file: str, timeout_s: float) -> Dict[str, Any]:
+    """Run py_compile and return result dict.
+ + Args: + code_file: Path to the Python file to compile + timeout_s: Timeout for compilation (not currently enforced for py_compile) + + Returns: + Dict with type="compile" and status/error info + """ + start = time.perf_counter() + try: + py_compile.compile(code_file, doraise=True) + return { + "type": "compile", + "status": "success", + "duration_ms": (time.perf_counter() - start) * 1000, + } + except py_compile.PyCompileError as e: + # Extract line number from the exception + # PyCompileError has exc_value which contains the SyntaxError + error_line: Optional[int] = None + error_column: Optional[int] = None + error_message = str(e) + + # Try to extract line/column from the underlying SyntaxError + if hasattr(e, "exc_value") and e.exc_value is not None: + exc = e.exc_value + if hasattr(exc, "lineno"): + error_line = exc.lineno + if hasattr(exc, "offset"): + error_column = exc.offset + if hasattr(exc, "msg"): + error_message = exc.msg + + return { + "type": "compile", + "status": "syntax_error", + "error_message": error_message, + "error_line": error_line, + "error_column": error_column, + "duration_ms": (time.perf_counter() - start) * 1000, + } + + +def run_test(code_file: str, test: Dict[str, Any], timeout_s: float) -> Dict[str, Any]: + """Run a single test and return result dict. 
+ + Args: + code_file: Path to the Python file to execute + test: Test specification with id, stdin, expected (optional) + timeout_s: Timeout in seconds for the test execution + + Returns: + Dict with type="test" and execution results + """ + start = time.perf_counter() + test_id = test.get("id", "unknown") + stdin_data = test.get("stdin", "") + + try: + proc = subprocess.run( + [sys.executable, code_file], + input=stdin_data, + capture_output=True, + text=True, + timeout=timeout_s, + ) + duration_ms = (time.perf_counter() - start) * 1000 + + # Classify status based on return code + if proc.returncode == 0: + status = "success" + elif proc.returncode == 137: + # SIGKILL - typically OOM killer + status = "memory_exceeded" + elif proc.returncode == 143: + # SIGTERM + status = "killed" + else: + status = "runtime_error" + + return { + "type": "test", + "id": test_id, + "status": status, + "stdout": proc.stdout, + "stderr": proc.stderr, + "exit_code": proc.returncode, + "duration_ms": duration_ms, + } + + except subprocess.TimeoutExpired as e: + # Capture any partial output + stdout = e.stdout.decode("utf-8", errors="replace") if e.stdout else "" + stderr = e.stderr.decode("utf-8", errors="replace") if e.stderr else "" + + return { + "type": "test", + "id": test_id, + "status": "timeout", + "stdout": stdout, + "stderr": stderr, + "exit_code": None, + "duration_ms": timeout_s * 1000, + } + + except Exception as e: + # Catch any unexpected errors (e.g., file not found) + duration_ms = (time.perf_counter() - start) * 1000 + return { + "type": "test", + "id": test_id, + "status": "runtime_error", + "stdout": "", + "stderr": f"Execution error: {e}", + "exit_code": None, + "duration_ms": duration_ms, + } + + +def _run_test_wrapper(args: Tuple[int, Dict[str, Any], str, float]) -> Tuple[int, Dict[str, Any]]: + """Wrapper for multiprocessing - must be top-level function for pickling. 
+ + Args: + args: Tuple of (test_index, test_dict, code_file, timeout_s) + + Returns: + Tuple of (test_index, result_dict) to preserve ordering info + """ + i, test, code_file, timeout_s = args + result = run_test(code_file, test, timeout_s) + return (i, result) + + +def run_tests_parallel( + code_file: str, + tests: List[Dict[str, Any]], + timeout_s: float, + num_workers: int = 16, +) -> Iterator[Dict[str, Any]]: + """Run tests in parallel using multiprocessing.Pool. + + Uses imap_unordered for streaming results as they complete (not waiting + for all tests). This dramatically reduces wall-clock time when tests + have varying execution times. + + Args: + code_file: Path to the Python file to execute + tests: List of test specifications + timeout_s: Timeout per test in seconds + num_workers: Number of parallel worker processes (default 16 for HPC) + + Yields: + Test result dicts as they complete (unordered) + """ + if not tests: + return + + # Prepare arguments for each test + args_list = [(i, test, code_file, timeout_s) for i, test in enumerate(tests)] + + # Use spawn context to avoid fork issues with subprocess-heavy workloads + # This is safer on HPC systems where fork can cause issues with MPI, CUDA, etc. 
+ ctx = multiprocessing.get_context("spawn") + + with ctx.Pool(processes=min(num_workers, len(tests))) as pool: + # imap_unordered streams results as they complete + for _i, result in pool.imap_unordered(_run_test_wrapper, args_list): + yield result + + +def main() -> None: + """Main entry point for batch runner.""" + # Get manifest path from command line or use default + manifest_path = sys.argv[1] if len(sys.argv) > 1 else "manifest.json" + + # Load manifest + try: + with open(manifest_path) as f: + manifest = json.load(f) + except Exception as e: + emit({ + "type": "error", + "message": f"Failed to load manifest: {e}", + }) + emit({ + "type": "done", + "total_tests": 0, + "passed": 0, + "failed": 0, + "compile_failed": False, + }) + return + + # Extract configuration + code_file = manifest.get("code_file", "solution.py") + compile_first = manifest.get("compile_first", True) + timeout_s = manifest.get("timeout_s", 5.0) + stop_on_first_failure = manifest.get("stop_on_first_failure", True) + num_workers = manifest.get("num_workers", 16) # Configurable via manifest + tests: List[Dict[str, Any]] = manifest.get("tests", []) + + # Step 1: Compile check (optional) + if compile_first: + result = compile_check(code_file, timeout_s) + emit(result) + + if result["status"] != "success": + # Compilation failed - emit done and exit + emit({ + "type": "done", + "total_tests": len(tests), + "passed": 0, + "failed": 0, + "compile_failed": True, + }) + return + + # Step 2: Run tests in parallel + passed = 0 + failed = 0 + received_ids: set[str] = set() + + # Use parallel execution for better throughput on HPC + for result in run_tests_parallel(code_file, tests, timeout_s, num_workers): + emit(result) # Stream immediately as each test completes + received_ids.add(result.get("id", "unknown")) + + if result["status"] == "success": + passed += 1 + else: + failed += 1 + + if stop_on_first_failure: + # Early termination - emit remaining tests as "not_run" + # Note: with parallel 
execution, some tests may have already + # started but the pool will be terminated on context exit + break + + # Emit any tests that didn't run (due to early termination or errors) + for test in tests: + test_id = test.get("id", "unknown") + if test_id not in received_ids: + emit({ + "type": "test", + "id": test_id, + "status": "not_run", + "stdout": "", + "stderr": "", + "exit_code": None, + "duration_ms": 0, + }) + + # Step 3: Emit done marker + emit({ + "type": "done", + "total_tests": len(tests), + "passed": passed, + "failed": failed, + "compile_failed": False, + }) + + +if __name__ == "__main__": + main() diff --git a/src/ludic/envs/code_exec/cache.py b/src/ludic/envs/code_exec/cache.py new file mode 100644 index 0000000..a5275b6 --- /dev/null +++ b/src/ludic/envs/code_exec/cache.py @@ -0,0 +1,129 @@ +""" +Shared LRU cache for code execution results. + +Provides thread-safe caching of BatchTestResult keyed by (code_hash, tests_hash). +Used by both Docker and Podman sandbox pools to avoid redundant execution of +identical code/test combinations. +""" + +from __future__ import annotations + +import threading +from collections import OrderedDict +from typing import Dict, Optional + +from .types import BatchTestResult + + +class LRUCache: + """ + Thread-safe LRU cache for BatchTestResult. + + Uses OrderedDict for LRU semantics and threading.Lock for safety. + Suitable for use across multiple async tasks sharing the same pool. + + Args: + max_size: Maximum number of entries to cache. Oldest entries are + evicted when this limit is exceeded. + """ + + def __init__(self, max_size: int = 10000): + self._max_size = max_size + self._cache: OrderedDict[tuple[str, str], BatchTestResult] = OrderedDict() + self._lock = threading.Lock() + self._hits = 0 + self._misses = 0 + + def get( + self, + code_hash: str, + tests_hash: str, + ) -> Optional[BatchTestResult]: + """ + Get cached result. + + On hit, moves item to end (most recently used). + Thread-safe. 
+ + Args: + code_hash: Hash of the code being executed. + tests_hash: Hash of the test cases. + + Returns: + Cached BatchTestResult if found, None otherwise. + """ + key = (code_hash, tests_hash) + with self._lock: + if key in self._cache: + # Move to end (most recently used) + self._cache.move_to_end(key) + self._hits += 1 + return self._cache[key] + else: + self._misses += 1 + return None + + def put( + self, + code_hash: str, + tests_hash: str, + result: BatchTestResult, + ) -> None: + """ + Cache a result. + + Evicts oldest item if cache is full. + Thread-safe. + + Args: + code_hash: Hash of the code being executed. + tests_hash: Hash of the test cases. + result: The test result to cache. + """ + key = (code_hash, tests_hash) + with self._lock: + if key in self._cache: + # Update existing entry and move to end + self._cache[key] = result + self._cache.move_to_end(key) + else: + # Add new entry + self._cache[key] = result + # Evict oldest if over limit + if len(self._cache) > self._max_size: + self._cache.popitem(last=False) # FIFO: remove oldest + + def clear(self) -> None: + """Clear all cached entries. Thread-safe.""" + with self._lock: + self._cache.clear() + # Note: We don't reset hit/miss counters on clear + + @property + def stats(self) -> Dict[str, int]: + """ + Get cache statistics (thread-safe). + + Returns: + Dict with keys: hits, misses, size, max_size + """ + with self._lock: + return { + "hits": self._hits, + "misses": self._misses, + "size": len(self._cache), + "max_size": self._max_size, + } + + @property + def hit_rate(self) -> float: + """ + Get cache hit rate as a float between 0 and 1. + + Returns 0.0 if no lookups have been performed. 
+ """ + with self._lock: + total = self._hits + self._misses + if total == 0: + return 0.0 + return self._hits / total diff --git a/src/ludic/envs/code_exec/docker_sandbox.py b/src/ludic/envs/code_exec/docker_sandbox.py new file mode 100644 index 0000000..5b5f354 --- /dev/null +++ b/src/ludic/envs/code_exec/docker_sandbox.py @@ -0,0 +1,725 @@ +""" +Docker-based sandbox implementation for code execution. + +This module provides: + - DockerSandboxConfig: Configuration for Docker containers + - DockerSandbox: Async Docker container sandbox + - DockerSandboxPool: Pool of Docker sandboxes with caching + +Requires: docker>=7.0.0 +Install with: pip install 'ludic[code-exec]' +""" + +from __future__ import annotations + +import asyncio +import hashlib +import io +import json +import logging +import os +import re +import tarfile +import time +import uuid +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass, field +from typing import AsyncIterator, Dict, List, Optional, Union + +logger = logging.getLogger(__name__) + +try: + import docker + from docker.models.containers import Container +except ImportError as e: + raise ImportError( + "Docker is not installed. 
Install it with: pip install 'ludic[code-exec]'" + ) from e + +from .parsing import ( + get_batch_runner_script, + parse_batch_compile_result, + parse_batch_test_result, + parse_syntax_error, +) +from .pool import BaseSandboxPool +from .sandbox import Sandbox, SandboxPool +from .types import ( + BatchExecutionSpec, + BatchTestResult, + CompileResult, + CompileStatus, + ExecutionResult, + RunStatus, + TestCase, +) + + +@dataclass +class DockerSandboxConfig: + """Configuration for Docker-based sandboxes.""" + + python_version: str = "3.11" + base_image: Optional[str] = None + memory_limit: str = "256m" + cpu_quota: int = 50000 # 50% of one CPU (out of 100000) + network_disabled: bool = True + working_dir: str = "/workspace" + + @property + def image(self) -> str: + """Get Docker image name (auto-generated or explicit).""" + if self.base_image: + return self.base_image + return f"python:{self.python_version}-slim" + + +class DockerSandbox: + """ + Async Docker container sandbox for Python code execution. + + Uses ThreadPoolExecutor to make docker-py calls non-blocking. + Implements the Sandbox protocol with full async support. + """ + + def __init__( + self, + container: Container, + config: DockerSandboxConfig, + executor: ThreadPoolExecutor, + ): + self._container = container + self._config = config + self._executor = executor + self._memory_limit_warned = False + + @property + def python_version(self) -> str: + return self._config.python_version + + async def reset(self) -> None: + """Clear workspace directory.""" + + def _reset(): + # Remove all files in workspace + self._container.exec_run( + f"sh -c 'rm -rf {self._config.working_dir}/*'", + workdir=self._config.working_dir, + ) + + loop = asyncio.get_event_loop() + await loop.run_in_executor(self._executor, _reset) + + async def compile( + self, + code: str, + *, + timeout_s: float = 5.0, + ) -> CompileResult: + """ + Syntax-check code using py_compile. 
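Outside a container, the same syntax gate can be approximated with the builtin `compile()`, which exposes the line number and message directly on the exception. This is a local sketch for intuition, not the sandboxed `py_compile` path used by the method below:

```python
def check_syntax(code: str):
    """Return (ok, line, message) for a Python source string."""
    try:
        compile(code, "<solution>", "exec")
        return True, None, None
    except SyntaxError as e:
        return False, e.lineno, e.msg

ok, line, msg = check_syntax("def f(:\n    pass\n")
```

Running `py_compile` inside the container instead keeps untrusted source strings out of the host interpreter entirely.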
+ + Returns rich error info including line and column numbers. + """ + start = time.perf_counter() + + def _compile(): + # Write code to temp file + self._write_file("_check.py", code) + + # Run py_compile + result = self._container.exec_run( + "python -m py_compile _check.py", + workdir=self._config.working_dir, + demux=True, + ) + return result + + loop = asyncio.get_event_loop() + try: + # Run with timeout + result = await asyncio.wait_for( + loop.run_in_executor(self._executor, _compile), + timeout=timeout_s, + ) + + duration_ms = (time.perf_counter() - start) * 1000 + + exit_code = result.exit_code + stdout, stderr = result.output + + if exit_code == 0: + return CompileResult( + status=CompileStatus.SUCCESS, + duration_ms=duration_ms, + ) + + # Parse error message + error_msg = (stderr or b"").decode("utf-8", errors="replace") + line, column, clean_msg = parse_syntax_error(error_msg) + + # Classify error type + status = CompileStatus.SYNTAX_ERROR + if "ImportError" in error_msg or "ModuleNotFoundError" in error_msg: + status = CompileStatus.IMPORT_ERROR + elif not clean_msg: + status = CompileStatus.UNKNOWN_ERROR + + return CompileResult( + status=status, + error_message=clean_msg or error_msg, + error_line=line, + error_column=column, + duration_ms=duration_ms, + ) + + except asyncio.TimeoutError: + duration_ms = (time.perf_counter() - start) * 1000 + return CompileResult( + status=CompileStatus.TIMEOUT, + error_message=f"Compilation timed out after {timeout_s}s", + duration_ms=duration_ms, + ) + + async def execute( + self, + code: str, + *, + stdin: str = "", + skip_compile: bool = False, + timeout_s: float = 10.0, + memory_limit_mb: Optional[int] = None, + env_vars: Optional[Dict[str, str]] = None, + ) -> ExecutionResult: + """ + Execute code with full resource isolation and rich metadata. + + Compiles first, then executes if compilation succeeds (unless skip_compile=True). 
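The exit-code classification used further down follows the usual shell convention of 128 + signal number (137 = 128 + SIGKILL, typically the OOM killer; 143 = 128 + SIGTERM). A pure-function sketch of that mapping:

```python
def classify_exit(exit_code: int) -> str:
    """Map a container exit code to a coarse run status (illustrative sketch)."""
    if exit_code == 0:
        return "success"
    if exit_code == 137:   # 128 + SIGKILL(9): usually the kernel OOM killer
        return "memory_exceeded"
    if exit_code == 143:   # 128 + SIGTERM(15): external termination
        return "killed"
    return "runtime_error"
```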
+ """ + # Log warning for memory_limit_mb if provided (only once per sandbox) + if memory_limit_mb is not None and not self._memory_limit_warned: + logger.warning( + "Per-execution memory limits are not supported by docker exec. " + "Container-level memory limit (%s) is enforced instead.", + self._config.memory_limit, + ) + self._memory_limit_warned = True + + # Step 1: Compile + if skip_compile: + compile_result = CompileResult(status=CompileStatus.SUCCESS) + else: + compile_result = await self.compile(code, timeout_s=timeout_s) + + total_start = time.perf_counter() + + if not compile_result.success: + # Return early with compilation failure + total_ms = (time.perf_counter() - total_start) * 1000 + return ExecutionResult( + compile_result=compile_result, + run_status=RunStatus.NOT_RUN, + compile_duration_ms=compile_result.duration_ms, + total_duration_ms=total_ms, + ) + + # Step 2: Execute + run_start = time.perf_counter() + + def _execute(): + # Generate unique execution ID to avoid race conditions + exec_id = uuid.uuid4().hex[:8] + exec_file = f"_exec_{exec_id}.py" + input_file = f"input_{exec_id}.txt" + + # Write code to file + self._write_file(exec_file, code) + + # Write stdin to file if provided + if stdin: + self._write_file(input_file, stdin) + # Build command with stdin redirection + cmd = f"python {self._config.working_dir}/{exec_file} < {self._config.working_dir}/{input_file}" + else: + # Build command without redirection + cmd = f"python {self._config.working_dir}/{exec_file}" + + # Prepare environment + environment = env_vars or {} + + # Run with resource limits + result = self._container.exec_run( + cmd, + workdir=self._config.working_dir, + demux=True, + environment=environment, + ) + + return result + + loop = asyncio.get_event_loop() + + try: + # Run with timeout + result = await asyncio.wait_for( + loop.run_in_executor(self._executor, _execute), + timeout=timeout_s, + ) + + run_ms = (time.perf_counter() - run_start) * 1000 + total_ms = 
(time.perf_counter() - total_start) * 1000 + + exit_code = result.exit_code + stdout, stderr = result.output + + stdout_str = (stdout or b"").decode("utf-8", errors="replace") + stderr_str = (stderr or b"").decode("utf-8", errors="replace") + + # Classify run status + if exit_code == 0: + run_status = RunStatus.SUCCESS + elif exit_code == 137: # SIGKILL (OOM) + run_status = RunStatus.MEMORY_EXCEEDED + elif exit_code == 143: # SIGTERM + run_status = RunStatus.KILLED + else: + run_status = RunStatus.RUNTIME_ERROR + + return ExecutionResult( + compile_result=compile_result, + run_status=run_status, + stdout=stdout_str, + stderr=stderr_str, + exit_code=exit_code, + compile_duration_ms=compile_result.duration_ms, + run_duration_ms=run_ms, + total_duration_ms=total_ms, + ) + + except asyncio.TimeoutError: + run_ms = (time.perf_counter() - run_start) * 1000 + total_ms = (time.perf_counter() - total_start) * 1000 + + # Try to kill the process + try: + await loop.run_in_executor( + self._executor, + lambda: self._container.exec_run("pkill -9 python"), + ) + except Exception: + pass # Best effort cleanup + + return ExecutionResult( + compile_result=compile_result, + run_status=RunStatus.TIMEOUT, + stderr=f"Execution timed out after {timeout_s}s", + compile_duration_ms=compile_result.duration_ms, + run_duration_ms=run_ms, + total_duration_ms=total_ms, + ) + + def _write_file(self, path: str, content: str) -> None: + """ + Write a file to the container using tarfile. + + Docker API doesn't have a direct "write file" method, + so we create a tar archive in memory and extract it. 
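The tar trick can be exercised without Docker: build the archive in memory, then read it back to confirm the payload survives. `put_archive` would receive the same buffer:

```python
import io
import tarfile
import time

def make_tar(path: str, content: str) -> io.BytesIO:
    """Pack a single text file into an in-memory tar archive."""
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w") as tar:
        data = content.encode("utf-8")
        info = tarfile.TarInfo(name=path)
        info.size = len(data)
        info.mtime = int(time.time())
        tar.addfile(info, io.BytesIO(data))
    buf.seek(0)
    return buf

# Round-trip: extract the file back out of the archive we just built
buf = make_tar("solution.py", "print('hi')")
with tarfile.open(fileobj=buf) as tar:
    roundtripped = tar.extractfile("solution.py").read().decode("utf-8")
```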
+ """ + # Create tar archive in memory + tar_buffer = io.BytesIO() + tar = tarfile.open(fileobj=tar_buffer, mode="w") + + # Add file to archive + file_data = content.encode("utf-8") + tarinfo = tarfile.TarInfo(name=path) + tarinfo.size = len(file_data) + tarinfo.mtime = time.time() + tar.addfile(tarinfo, io.BytesIO(file_data)) + tar.close() + + # Extract to container + tar_buffer.seek(0) + self._container.put_archive(self._config.working_dir, tar_buffer) + + # ------------------------------------------------------------------------- + # Batch execution (reduces ThreadPoolExecutor calls from O(N) to O(1)) + # ------------------------------------------------------------------------- + + async def execute_batch( + self, + spec: BatchExecutionSpec, + ) -> AsyncIterator[Union[CompileResult, ExecutionResult]]: + """ + Execute all tests in a single batch with streaming results. + + This method reduces the number of ThreadPoolExecutor calls by: + 1. Bundling code, manifest, and runner into a single tar + 2. Executing the batch runner once, which runs all tests sequentially + 3. 
Streaming results back as JSONL + + Args: + spec: Batch execution specification with code, tests, and options + + Yields: + CompileResult (if compile_first=True), then ExecutionResult for each test + """ + batch_dir = "_batch" + batch_start = time.perf_counter() + loop = asyncio.get_event_loop() + + # Build manifest for the batch runner + manifest = { + "code_file": "solution.py", + "compile_first": spec.compile_first, + "timeout_s": spec.timeout_s, + "stop_on_first_failure": spec.stop_on_first_failure, + "tests": [ + {"id": t.id or f"test_{i}", "stdin": t.input, "expected": t.expected} + for i, t in enumerate(spec.tests) + ], + } + + # Build and write tar archive + tar_data = self._build_batch_tar( + manifest=manifest, + code=spec.code, + runner_script=get_batch_runner_script(), + batch_dir=batch_dir, + ) + + def _write_tar(): + tar_buffer = io.BytesIO(tar_data) + self._container.put_archive(self._config.working_dir, tar_buffer) + + await loop.run_in_executor(self._executor, _write_tar) + + # Execute batch runner and stream results + manifest_path = f"{self._config.working_dir}/{batch_dir}/manifest.json" + runner_path = f"{self._config.working_dir}/{batch_dir}/batch_runner.py" + + run_start = time.perf_counter() + received_done = False + received_test_ids: set[str] = set() + compile_result: Optional[CompileResult] = None + + def _execute(): + result = self._container.exec_run( + f"python {runner_path} {manifest_path}", + workdir=f"{self._config.working_dir}/{batch_dir}", + demux=True, + ) + return result + + try: + result = await asyncio.wait_for( + loop.run_in_executor(self._executor, _execute), + timeout=spec.timeout_s * len(spec.tests) + 10.0, # Extra buffer + ) + + stdout, stderr = result.output + stdout_str = (stdout or b"").decode("utf-8", errors="replace") + + # Parse JSONL output + for line in stdout_str.strip().split("\n"): + if not line: + continue + + try: + result_dict = json.loads(line) + except json.JSONDecodeError: + logger.warning(f"Invalid JSON 
from batch runner: {line}") + continue + + result_type = result_dict.get("type") + + if result_type == "compile": + compile_result = parse_batch_compile_result(result_dict) + yield compile_result + if not compile_result.success: + break + + elif result_type == "test": + test_id = result_dict.get("id", "unknown") + received_test_ids.add(test_id) + exec_result = parse_batch_test_result(result_dict, run_start) + yield exec_result + + elif result_type == "done": + received_done = True + break + + elif result_type == "error": + logger.error(f"Batch runner error: {result_dict.get('message')}") + + except asyncio.TimeoutError: + logger.warning(f"Batch execution timed out") + + except Exception as e: + logger.warning(f"Batch execution failed: {e}") + + # Handle missing tests + if not received_done and compile_result is None: + compile_result = CompileResult( + status=CompileStatus.UNKNOWN_ERROR, + error_message="Batch execution terminated unexpectedly", + duration_ms=(time.perf_counter() - batch_start) * 1000, + ) + yield compile_result + + if not received_done and (compile_result is None or compile_result.success): + for i, test in enumerate(spec.tests): + test_id = test.id or f"test_{i}" + if test_id not in received_test_ids: + run_ms = (time.perf_counter() - run_start) * 1000 + yield ExecutionResult( + compile_result=compile_result or CompileResult( + status=CompileStatus.SUCCESS + ), + run_status=RunStatus.SANDBOX_ERROR, + stdout="", + stderr="Batch execution terminated unexpectedly", + exit_code=None, + run_duration_ms=run_ms, + total_duration_ms=run_ms, + ) + + def _build_batch_tar( + self, + manifest: dict, + code: str, + runner_script: str, + batch_dir: str = "_batch", + ) -> bytes: + """Build tar archive containing batch execution files.""" + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w") as tar: + # Create directory entry first + dir_info = tarfile.TarInfo(name=batch_dir) + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o755 + dir_info.mtime = 
int(time.time()) + tar.addfile(dir_info) + + # Add manifest.json + manifest_data = json.dumps(manifest, indent=2).encode("utf-8") + info = tarfile.TarInfo(name=f"{batch_dir}/manifest.json") + info.size = len(manifest_data) + info.mtime = int(time.time()) + tar.addfile(info, io.BytesIO(manifest_data)) + + # Add solution.py + code_data = code.encode("utf-8") + info = tarfile.TarInfo(name=f"{batch_dir}/solution.py") + info.size = len(code_data) + info.mtime = int(time.time()) + tar.addfile(info, io.BytesIO(code_data)) + + # Add batch_runner.py + runner_data = runner_script.encode("utf-8") + info = tarfile.TarInfo(name=f"{batch_dir}/batch_runner.py") + info.size = len(runner_data) + info.mtime = int(time.time()) + tar.addfile(info, io.BytesIO(runner_data)) + + buf.seek(0) + return buf.read() + + +class DockerSandboxPool(BaseSandboxPool[DockerSandbox]): + """ + Pool of Docker sandboxes with LRU caching. + + Manages container lifecycle, checkout/release, and execution caching. + Inherits background reset pattern from BaseSandboxPool. + """ + + def __init__( + self, + n_workers: int = 4, + config: Optional[DockerSandboxConfig] = None, + cache_size: int = 10000, + executor_threads: int = 8, + auto_replace_failed: bool = False, + max_consecutive_failures: int = 5, + max_concurrent_ops: int = 8, + ): + # Initialize base pool + super().__init__( + n_workers=n_workers, + cache_size=cache_size, + auto_replace_failed=auto_replace_failed, + max_consecutive_failures=max_consecutive_failures, + max_concurrent_ops=max_concurrent_ops, + ) + + # Docker-specific configuration + self._config = config or DockerSandboxConfig() + self._executor = ThreadPoolExecutor(max_workers=executor_threads) + self._docker_client: Optional[docker.DockerClient] = None + + @property + def python_version(self) -> str: + return self._config.python_version + + async def _create_sandboxes(self) -> list[DockerSandbox]: + """ + Create all Docker containers. 
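The `cpu_quota` / `cpu_period` pair passed to `containers.create` follows the cgroups CFS convention: the container may consume `quota / period` CPUs per scheduling period. With the defaults above (50000 over a 100000µs period) that is half a core:

```python
def cpu_fraction(cpu_quota: int, cpu_period: int = 100_000) -> float:
    """CPU share granted by a cgroups CFS quota/period pair."""
    return cpu_quota / cpu_period

half_core = cpu_fraction(50_000)    # the pool's default quota
two_cores = cpu_fraction(200_000)   # quota may exceed one period
```

Raising `cpu_quota` per container trades isolation fairness for faster individual test runs.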
+ + Pulls the image if needed, creates containers with resource limits. + Called by base class start() method. + """ + loop = asyncio.get_event_loop() + + def _start(): + # Create Docker client + client = docker.from_env() + + # Pull image if not present + try: + client.images.get(self._config.image) + except docker.errors.ImageNotFound: + print(f"Pulling image {self._config.image}...") + client.images.pull(self._config.image) + + # Define function to create a single container + def create_container(i: int): + # Generate container name with PID for uniqueness + container_name = f"ludic-sandbox-{self._config.python_version}-{os.getpid()}-{i}" + + # Remove existing container if present + try: + old = client.containers.get(container_name) + old.remove(force=True) + except docker.errors.NotFound: + pass + + # Create container with resource limits + container = client.containers.create( + image=self._config.image, + name=container_name, + detach=True, + command="sleep infinity", # Keep container alive + mem_limit=self._config.memory_limit, + cpu_quota=self._config.cpu_quota, + cpu_period=100000, # Standard 100ms period + network_disabled=self._config.network_disabled, + working_dir=self._config.working_dir, + auto_remove=False, # We'll manage cleanup + ) + + # Start container + container.start() + + # Create sandbox wrapper + return DockerSandbox( + container=container, + config=self._config, + executor=self._executor, + ) + + # Parallelize container creation + with ThreadPoolExecutor(max_workers=self._n_workers) as pool: + sandboxes = list(pool.map(create_container, range(self._n_workers))) + + return client, sandboxes + + # Run container creation in executor + self._docker_client, sandboxes = await loop.run_in_executor( + self._executor, _start + ) + + return sandboxes + + async def _stop_sandbox(self, sandbox: DockerSandbox) -> None: + """ + Stop and remove a single Docker container. + + Called during shutdown and when replacing a failed sandbox. 
+ Errors are logged but not raised. + """ + loop = asyncio.get_event_loop() + + def _stop(): + try: + sandbox._container.stop(timeout=2) + sandbox._container.remove(force=True) + except Exception as e: + print(f"Warning: Failed to remove container: {e}") + + await loop.run_in_executor(self._executor, _stop) + + async def _create_replacement_sandbox(self) -> Optional[DockerSandbox]: + """ + Create a single replacement Docker container. + + Called when a sandbox fails to reset and auto_replace_failed is True. + Returns None if container creation fails. + """ + loop = asyncio.get_event_loop() + + def _create(): + if self._docker_client is None: + return None + + try: + # Generate unique container name + import random + i = random.randint(10000, 99999) + container_name = f"ludic-sandbox-{self._config.python_version}-{os.getpid()}-{i}" + + # Remove existing container if present + try: + old = self._docker_client.containers.get(container_name) + old.remove(force=True) + except docker.errors.NotFound: + pass + + # Create container with resource limits + container = self._docker_client.containers.create( + image=self._config.image, + name=container_name, + detach=True, + command="sleep infinity", + mem_limit=self._config.memory_limit, + cpu_quota=self._config.cpu_quota, + cpu_period=100000, + network_disabled=self._config.network_disabled, + working_dir=self._config.working_dir, + auto_remove=False, + ) + + # Start container + container.start() + + # Create sandbox wrapper + return DockerSandbox( + container=container, + config=self._config, + executor=self._executor, + ) + except Exception: + return None + + return await loop.run_in_executor(self._executor, _create) + + async def shutdown(self) -> None: + """ + Tear down all containers and release resources. + + Waits for pending resets, stops containers, closes Docker client, + and shuts down executor. 
+ """ + # Base shutdown handles pending resets and calls _stop_sandbox + await super().shutdown() + + # Docker-specific cleanup + loop = asyncio.get_event_loop() + + def _close_client(): + if self._docker_client: + self._docker_client.close() + + await loop.run_in_executor(self._executor, _close_client) + + # Shutdown executor + self._executor.shutdown(wait=True) diff --git a/src/ludic/envs/code_exec/env.py b/src/ludic/envs/code_exec/env.py new file mode 100644 index 0000000..1e0f190 --- /dev/null +++ b/src/ludic/envs/code_exec/env.py @@ -0,0 +1,452 @@ +""" +Main environment for code execution RL tasks. + +This environment bridges the world of RL agents and code execution sandboxes, +providing a clean SingleAgentEnv interface for training LLMs to write code. + +Key design decisions: + 1. env_reset and env_step are async to support async sandbox operations + 2. The interaction protocol (Phase 6) must detect and await these coroutines + 3. Caching is handled at the pool level but controllable via config + 4. Rich info dict includes all execution metadata for analysis/logging + +Note: env_reset and env_step are async methods. The interaction protocol +must detect this and await them. See Phase 6 integration. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +from ludic.envs.single_agent_env import SingleAgentEnv +from ludic.types import Info, Observation, StepOutcome + +from .adapters.base import ExactMatchVerifier, OutputVerifier, TestAdapter +from .runners import CodeRunner, StdinStdoutRunner, compute_hash, hash_tests +from .sandbox import SandboxPool +from .types import BatchTestResult, TestCase + + +@dataclass +class CodeExecConfig: + """Configuration for CodeExecEnv behavior.""" + + # Execution limits + timeout_per_test_s: float = 5.0 # efficiency-focused default + memory_limit_mb: int = 256 + max_tests: Optional[int] = None # limit number of tests + stop_on_first_failure: bool = True + compile_first: bool = True + + # Reward shaping + partial_credit: bool = False # reward = fraction passed + compile_failure_reward: float = -0.1 + + # Observations + include_stderr_in_obs: bool = True + max_error_length: int = 500 + + # Caching + use_cache: bool = True + + +class CodeExecEnv(SingleAgentEnv): + """ + Code execution environment for RL training. + + This environment: + - Takes a dataset sample containing a problem + test cases + - Extracts prompt and tests via a TestAdapter + - Executes submitted code in a Sandbox from a SandboxPool + - Verifies outputs using an OutputVerifier + - Computes rewards based on test results + - Returns rich info dicts for logging/analysis + + The environment is single-step by design: agent submits code once, + gets results, episode ends. For multi-step refinement, wrap this + in a meta-environment or use a ReAct-style agent with tool calling. 
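The reward scheme implemented by `_compute_reward` further down reduces to a small pure function; this standalone sketch mirrors that logic for its three cases (compile failure penalty, binary all-or-nothing, partial credit):

```python
def compute_reward(
    passed: int,
    total: int,
    compile_failed: bool,
    partial_credit: bool = False,
    compile_failure_reward: float = -0.1,
) -> float:
    """Mirror of the env's reward logic (sketch)."""
    if compile_failed:
        return compile_failure_reward   # special penalty, never positive credit
    if passed == total:
        return 1.0
    if partial_credit:
        return passed / total           # fraction of tests passed
    return 0.0                          # binary: all or nothing
```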
+ + Example usage: + ```python + pool = await create_sandbox_pool(size=4) + adapter = APPSAdapter() + + env = CodeExecEnv( + sample=dataset[0], + sandbox_pool=pool, + test_adapter=adapter, + config=CodeExecConfig(partial_credit=True), + ) + + obs, info = await env.env_reset() + outcome = await env.env_step(agent_code) + ``` + """ + + def __init__( + self, + sample: Dict[str, Any], + *, + sandbox_pool: SandboxPool, + test_adapter: TestAdapter, + code_runner: Optional[CodeRunner] = None, + verifier: Optional[OutputVerifier] = None, + config: Optional[CodeExecConfig] = None, + system_prompt: Optional[str] = None, + ) -> None: + """ + Initialize the code execution environment. + + Args: + sample: Dataset sample containing problem and tests + sandbox_pool: Shared pool of sandboxes for execution + test_adapter: Adapter to extract prompt/tests from sample + code_runner: Runner for executing code (default: StdinStdoutRunner) + verifier: Output verifier (default: ExactMatchVerifier) + config: Environment configuration (default: CodeExecConfig()) + system_prompt: Optional system prompt for the agent + """ + super().__init__() + + self._sample = sample + self._sandbox_pool = sandbox_pool + self._test_adapter = test_adapter + self._code_runner = code_runner or StdinStdoutRunner( + default_timeout_s=config.timeout_per_test_s if config else 5.0, + memory_limit_mb=config.memory_limit_mb if config else 256, + ) + self._verifier = verifier or ExactMatchVerifier() + self._config = config or CodeExecConfig() + self._system_prompt = system_prompt + + # Episode state (set during reset) + self._problem_id: Optional[str] = None + self._prompt: Optional[str] = None + self._tests: Optional[List[TestCase]] = None + self._tests_hash: Optional[str] = None + self._current_obs: Optional[Observation] = None + + @property + def suggested_sysprompt(self) -> Optional[str]: + """Return the configured system prompt.""" + return self._system_prompt + + async def env_reset( + self, *, seed: 
Optional[int] = None + ) -> Tuple[Observation, Info]: + """ + Reset the environment for a new episode. + + Extracts the problem prompt and test cases from the sample, + but does not checkout a sandbox yet (that happens on step). + + Args: + seed: Optional random seed (unused in this deterministic env) + + Returns: + Tuple of (prompt, info) where info contains problem metadata + """ + # Extract problem components via adapter + self._problem_id = self._test_adapter.get_problem_id(self._sample) + self._prompt = self._test_adapter.get_prompt(self._sample) + self._tests = self._test_adapter.get_tests(self._sample) + + # Handle case where no tests were extracted + if not self._tests: + error_msg = f"No tests extracted for problem {self._problem_id}" + self._current_obs = error_msg + return self._current_obs, { + "problem_id": self._problem_id, + "error": "no_tests_extracted", + } + + # Apply max_tests limit if configured + if self._config.max_tests is not None: + self._tests = self._tests[: self._config.max_tests] + + # Compute tests hash for caching + self._tests_hash = hash_tests(self._tests) + + # Set current observation to the prompt + self._current_obs = self._prompt + + # Build info dict with episode metadata + info: Info = { + "problem_id": self._problem_id, + "num_tests": len(self._tests), + "tests_hash": self._tests_hash, + "python_version": self._sandbox_pool.python_version, + } + + return self._current_obs, info + + async def env_step(self, action: str) -> StepOutcome: + """ + Execute submitted code and return results. + + This is the core of the environment: takes the agent's code, + runs it through the sandbox, computes rewards, and builds + rich observations and info dicts. 
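The cache-first checkout order in `env_step` is what makes duplicate submissions (common under GRPO/CISPO-style repeated sampling) cheap: a hit never touches the sandbox pool. A toy async sketch of that control flow, with a dict standing in for the shared LRU and a counter standing in for real execution:

```python
import asyncio

cache: dict = {}
executions = 0

async def run_in_sandbox(code_hash: str) -> str:
    global executions
    executions += 1                 # stands in for checkout + run + release
    return f"result-for-{code_hash}"

async def step(code_hash: str) -> str:
    # Check the cache FIRST, before touching the pool
    if code_hash in cache:
        return cache[code_hash]
    result = await run_in_sandbox(code_hash)
    cache[code_hash] = result
    return result

async def main():
    a = await step("abc123")
    b = await step("abc123")   # duplicate submission: served from cache
    return a, b

a, b = asyncio.run(main())
```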
+ + Args: + action: The code submitted by the agent + + Returns: + StepOutcome with observation, reward, termination flags, and info + """ + # Sanity check: ensure reset was called + if self._tests is None or self._tests_hash is None: + error_obs = "Error: env_reset() must be called before env_step()" + return StepOutcome( + obs=error_obs, + reward=-1.0, + truncated=False, + terminated=True, + info={"error": "reset_not_called"}, + ) + + # Handle empty code submission + if not action.strip(): + error_obs = "Error: Empty code submission" + return StepOutcome( + obs=error_obs, + reward=self._config.compile_failure_reward, + truncated=False, + terminated=True, + info={"error": "empty_code"}, + ) + + # Compute code hash for caching + code = action.strip() + code_hash = compute_hash(code) + + # Check cache FIRST, before checkout + result: Optional[BatchTestResult] = None + cache_hit = False + + if self._config.use_cache: + result = self._sandbox_pool.get_cached(code_hash, self._tests_hash) + if result is not None: + cache_hit = True + + # Only checkout sandbox if cache miss + if result is None: + # Checkout sandbox from pool + sandbox = await self._sandbox_pool.checkout() + + try: + # Run tests via code runner + result = await self._code_runner.run_tests( + sandbox=sandbox, + code=code, + tests=self._tests, + verifier=self._verifier, + stop_on_first_failure=self._config.stop_on_first_failure, + compile_first=self._config.compile_first, + ) + + # Cache result if enabled + if self._config.use_cache: + self._sandbox_pool.put_cached(code_hash, self._tests_hash, result) + + finally: + # Always release sandbox back to pool + await self._sandbox_pool.release(sandbox) + + # Compute reward based on results + reward = self._compute_reward(result) + + # Build observation for agent + obs = self._build_observation(result) + self._current_obs = obs + + # Build rich info dict for logging/analysis + info = self._build_info(result, code_hash, cache_hit) + + # Episode ends after single 
step (single-shot code generation) + return StepOutcome( + obs=obs, + reward=reward, + truncated=False, + terminated=True, + info=info, + ) + + def env_current_obs(self) -> Observation: + """ + Return the current observation. + + Returns: + The current observation string + """ + if self._current_obs is None: + return "Error: No observation available (call env_reset first)" + return self._current_obs + + def _compute_reward(self, result: BatchTestResult) -> float: + """ + Compute reward from test results. + + Reward schemes: + - partial_credit=False: 1.0 if all passed, 0.0 otherwise + - partial_credit=True: fraction of tests passed (0.0 to 1.0) + - Compilation failures get compile_failure_reward + + Args: + result: Batch test results + + Returns: + Scalar reward value + """ + # Compilation failure gets special penalty + if result.compile_failed: + return self._config.compile_failure_reward + + # All tests passed + if result.all_passed: + return 1.0 + + # Partial credit + if self._config.partial_credit: + return result.pass_rate + + # Binary reward (all or nothing) + return 0.0 + + def _build_observation(self, result: BatchTestResult) -> str: + """ + Build observation string from test results. + + The observation provides feedback to the agent about what went wrong, + including compilation errors, runtime errors, or test failures. + + Args: + result: Batch test results + + Returns: + Observation string for the agent + """ + # All tests passed - success message + if result.all_passed: + return ( + f"All {result.total_count} tests passed! " + f"Total execution time: {result.total_run_ms:.1f}ms" + ) + + # Compilation failed - show compile error + if result.compile_failed: + first = result.results[0] + compile_err = ( + first.execution.compile_result.error_message or "Unknown error" + ) + + # Truncate error if too long + if len(compile_err) > self._config.max_error_length: + compile_err = compile_err[: self._config.max_error_length] + "..." 
+ + obs = f"Compilation failed: {compile_err}" + + if first.execution.compile_result.error_line is not None: + obs += f" (line {first.execution.compile_result.error_line})" + + return obs + + # Some tests failed - show first failure details + first_failure = result.first_failure + if first_failure is None: + # Should never happen, but handle gracefully + return f"Tests failed: {result.passed_count}/{result.total_count} passed" + + obs_parts = [f"Tests failed: {result.passed_count}/{result.total_count} passed"] + + # Add first failure details + if first_failure.comparison_details: + details = first_failure.comparison_details + if len(details) > self._config.max_error_length: + details = details[: self._config.max_error_length] + "..." + obs_parts.append(f"\nFirst failure: {details}") + + # Add stderr if configured and available + if self._config.include_stderr_in_obs and first_failure.execution.stderr: + stderr = first_failure.execution.stderr.strip() + if stderr: + if len(stderr) > self._config.max_error_length: + stderr = stderr[: self._config.max_error_length] + "..." + obs_parts.append(f"\nStderr: {stderr}") + + return "".join(obs_parts) + + def _build_info( + self, + result: BatchTestResult, + code_hash: str, + cache_hit: bool, + ) -> Info: + """ + Build rich info dict with all execution metadata. + + The info dict is JSON-serializable and includes everything needed + for logging, analysis, and debugging. 
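Because the info dict must be JSON-serializable, a cheap invariant check is a round-trip through `json`; the field names below are a subset of those assembled in the code that follows:

```python
import json

info = {
    "problem_id": "apps-0042",   # illustrative values only
    "passed": 3,
    "total": 5,
    "pass_rate": 0.6,
    "compile_failed": False,
    "timing": {"total_run_ms": 17.3},
    "cache_hit": True,
}

# Anything that survives dumps/loads unchanged is safe to log as JSON
restored = json.loads(json.dumps(info))
```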
+ + Args: + result: Batch test results + code_hash: Hash of the submitted code + cache_hit: Whether result came from cache + + Returns: + Info dict with comprehensive metadata + """ + # Build per-test result summaries + test_results = [] + for test_result in result.results: + test_info = { + "test_id": test_result.test_case.id, + "passed": test_result.passed, + "compiled": test_result.compiled, + "ran": test_result.ran, + "run_status": ( + test_result.execution.run_status.value + if test_result.execution.run_status + else None + ), + "compile_status": test_result.execution.compile_result.status.value, + "run_duration_ms": test_result.execution.run_duration_ms, + "stdout": test_result.execution.stdout, + "stderr": test_result.execution.stderr, + } + + # Optionally include failure details + if not test_result.passed and test_result.comparison_details: + test_info["failure_reason"] = test_result.comparison_details + + test_results.append(test_info) + + # Build complete info dict + info: Info = { + # Problem metadata + "problem_id": self._problem_id, + "code_hash": code_hash, + "tests_hash": self._tests_hash, + # Test results summary + "passed": result.passed_count, + "total": result.total_count, + "all_passed": result.all_passed, + "pass_rate": result.pass_rate, + "compile_failed": result.compile_failed, + # Detailed test results + "test_results": test_results, + # Timing + "timing": { + "total_compile_ms": result.total_compile_ms, + "total_run_ms": result.total_run_ms, + "total_execution_ms": result.total_execution_ms, + }, + # Cache info + "cache_hit": cache_hit, + "cache_stats": self._sandbox_pool.cache_stats, + # Environment metadata + "python_version": self._sandbox_pool.python_version, + } + + return info diff --git a/src/ludic/envs/code_exec/factory.py b/src/ludic/envs/code_exec/factory.py new file mode 100644 index 0000000..d993dee --- /dev/null +++ b/src/ludic/envs/code_exec/factory.py @@ -0,0 +1,210 @@ +""" +Unified factory for creating sandbox pools. 
+ +This module provides: + - create_sandbox_pool(): Async factory that auto-detects or uses specified backend + +Usage: + from ludic.envs.code_exec import create_sandbox_pool + + # Auto-detect backend + pool = await create_sandbox_pool(n_workers=4) + + # Explicit backend + pool = await create_sandbox_pool(n_workers=4, backend="podman-hpc") + + # With custom config + pool = await create_sandbox_pool( + n_workers=4, + backend="docker", + python_version="3.11", + memory_limit="512m", + ) +""" + +from __future__ import annotations + +from typing import Any, Optional + +from .backend import SandboxBackend, detect_available_backend +from .sandbox import SandboxPool + + +async def create_sandbox_pool( + n_workers: int = 4, + backend: str = "auto", + python_version: str = "3.11", + cache_size: int = 10000, + max_concurrent_ops: int = 8, + workspace_base_dir: str = "auto", + **backend_kwargs: Any, +) -> SandboxPool: + """ + Create and start a sandbox pool with the specified or auto-detected backend. + + This is the recommended way to create sandbox pools as it handles: + - Backend auto-detection based on environment + - Consistent configuration across backends + - Proper initialization (pull images, start containers) + + Args: + n_workers: Number of parallel sandboxes in the pool + backend: Backend to use ("auto", "docker", "podman-hpc", "singularity") + python_version: Python version for the sandbox containers + cache_size: Maximum number of cached execution results + max_concurrent_ops: Maximum concurrent sandbox operations (resets, exec + calls). Prevents deadlock in HPC environments. Default 8. + workspace_base_dir: Base directory for host-mounted workspaces. 
- "auto": Auto-detect (use /local on HPC if SLURM_JOB_ID set)
+            - explicit path: Use this directory
+            - None: Disable bind mounts, use tar-based I/O
+        **backend_kwargs: Additional backend-specific configuration:
+            - memory_limit (str): Memory limit (e.g., "256m", "1g")
+            - cpu_quota: CPU limit. For podman-hpc, a fraction of one CPU
+              (e.g., 0.5 = 50%). For docker, a CFS quota in microseconds
+              (e.g., 50000 = 50% with the default 100ms period).
+            - network_disabled (bool): Disable network access (default: True)
+            - gpu (bool): Enable GPU access (podman-hpc only)
+            - image (str): Custom container image (overrides python_version)
+            - sif_path (str): Path to .sif file (singularity only)
+
+    Returns:
+        Started SandboxPool instance
+
+    Raises:
+        RuntimeError: If the specified backend is not available
+        ValueError: If an unknown backend is specified
+
+    Examples:
+        # Auto-detect (recommended)
+        pool = await create_sandbox_pool(n_workers=4)
+
+        # Docker with custom memory
+        pool = await create_sandbox_pool(
+            n_workers=4,
+            backend="docker",
+            memory_limit="512m",
+        )
+
+        # Podman-HPC with GPU
+        pool = await create_sandbox_pool(
+            n_workers=4,
+            backend="podman-hpc",
+            gpu=True,
+        )
+    """
+    # Resolve backend
+    if backend == "auto" or backend == SandboxBackend.AUTO:
+        resolved_backend = detect_available_backend()
+        print(f"Auto-detected sandbox backend: {resolved_backend}")
+    else:
+        resolved_backend = backend
+
+    # Create pool based on backend
+    if resolved_backend == SandboxBackend.DOCKER.value:
+        pool = _create_docker_pool(
+            n_workers=n_workers,
+            python_version=python_version,
+            cache_size=cache_size,
+            max_concurrent_ops=max_concurrent_ops,
+            **backend_kwargs,
+        )
+
+    elif resolved_backend == SandboxBackend.PODMAN_HPC.value:
+        pool = _create_podman_hpc_pool(
+            n_workers=n_workers,
+            python_version=python_version,
+            cache_size=cache_size,
+            max_concurrent_ops=max_concurrent_ops,
+            workspace_base_dir=workspace_base_dir,
+            **backend_kwargs,
+        )
+
+    elif resolved_backend == SandboxBackend.SINGULARITY.value:
+        raise NotImplementedError(
+            "Singularity backend is not yet 
implemented. " + "Use 'docker' or 'podman-hpc' instead." + ) + + else: + raise ValueError( + f"Unknown backend: {resolved_backend}. " + f"Valid options: {', '.join(b.value for b in SandboxBackend if b != SandboxBackend.AUTO)}" + ) + + # Start the pool + await pool.start() + return pool + + +def _create_docker_pool( + n_workers: int, + python_version: str, + cache_size: int, + max_concurrent_ops: int = 8, + memory_limit: str = "256m", + cpu_quota: int = 50000, + network_disabled: bool = True, + image: Optional[str] = None, + **_kwargs: Any, +) -> SandboxPool: + """Create DockerSandboxPool with configuration.""" + try: + from .docker_sandbox import DockerSandboxConfig, DockerSandboxPool + except ImportError: + raise RuntimeError( + "Docker backend requires the docker package:\n" + " pip install docker>=7.0.0" + ) + + config = DockerSandboxConfig( + python_version=python_version, + base_image=image, + memory_limit=memory_limit, + cpu_quota=cpu_quota, + network_disabled=network_disabled, + ) + + return DockerSandboxPool( + n_workers=n_workers, + config=config, + cache_size=cache_size, + max_concurrent_ops=max_concurrent_ops, + ) + + +def _create_podman_hpc_pool( + n_workers: int, + python_version: str, + cache_size: int, + max_concurrent_ops: int = 8, + workspace_base_dir: str = "auto", + memory_limit: str = "256m", + cpu_quota: Optional[float] = None, + network_disabled: bool = True, + gpu: bool = False, + image: Optional[str] = None, + extra_args: Optional[list[str]] = None, + **_kwargs: Any, +) -> SandboxPool: + """Create PodmanHPCSandboxPool with configuration.""" + from .podman_sandbox import PodmanConfig, PodmanHPCSandboxPool + + config = PodmanConfig( + memory_limit=memory_limit, + cpu_quota=cpu_quota, + network_disabled=network_disabled, + gpu=gpu, + extra_args=extra_args, + ) + + # Determine image + if image is None: + image = f"python:{python_version}-slim" + + return PodmanHPCSandboxPool( + n_workers=n_workers, + image=image, + config=config, + 
cache_size=cache_size, + max_concurrent_ops=max_concurrent_ops, + workspace_base_dir=workspace_base_dir, + ) diff --git a/src/ludic/envs/code_exec/parsing.py b/src/ludic/envs/code_exec/parsing.py new file mode 100644 index 0000000..a24e6e3 --- /dev/null +++ b/src/ludic/envs/code_exec/parsing.py @@ -0,0 +1,127 @@ +"""Shared parsing utilities for code execution sandboxes.""" + +from __future__ import annotations + +import re +import time +from typing import Optional + +from .types import CompileResult, CompileStatus, ExecutionResult, RunStatus + +# Import batch runner script using importlib.resources +try: + from importlib.resources import files + + _BATCH_RUNNER_SCRIPT: Optional[str] = None + + def get_batch_runner_script() -> str: + """Lazy-load the batch runner script from package resources.""" + global _BATCH_RUNNER_SCRIPT + if _BATCH_RUNNER_SCRIPT is None: + _BATCH_RUNNER_SCRIPT = ( + files("ludic.envs.code_exec") + .joinpath("batch_runner.py") + .read_text() + ) + return _BATCH_RUNNER_SCRIPT + +except ImportError: + # Fallback for older Python versions + import pkg_resources + + def get_batch_runner_script() -> str: + return pkg_resources.resource_string( + "ludic.envs.code_exec", "batch_runner.py" + ).decode("utf-8") + + +def parse_syntax_error(error_msg: str) -> tuple[Optional[int], Optional[int], str]: + """Parse Python syntax error to extract line, column, and clean message.""" + line = None + column = None + clean_msg = "" + + # Try to find line number + line_match = re.search(r'line (\d+)', error_msg) + if line_match: + line = int(line_match.group(1)) + + # Try to find column number + col_match = re.search(r'column (\d+)', error_msg) + if col_match: + column = int(col_match.group(1)) + + # Extract error type and message + error_type_match = re.search( + r'(SyntaxError|IndentationError|TabError):\s*(.+)', error_msg + ) + if error_type_match: + error_type = error_type_match.group(1) + msg = error_type_match.group(2).strip() + clean_msg = f"{error_type}: 
{msg}" + else: + # Fall back to just extracting the last line + lines = [l.strip() for l in error_msg.split('\n') if l.strip()] + if lines: + clean_msg = lines[-1] + + return line, column, clean_msg + + +def parse_batch_compile_result(result: dict) -> CompileResult: + """Parse compile result from batch runner JSON.""" + status_str = result.get("status", "unknown_error") + + if status_str == "success": + status = CompileStatus.SUCCESS + elif status_str == "syntax_error": + status = CompileStatus.SYNTAX_ERROR + elif status_str == "timeout": + status = CompileStatus.TIMEOUT + else: + status = CompileStatus.UNKNOWN_ERROR + + return CompileResult( + status=status, + error_message=result.get("error_message"), + error_line=result.get("error_line"), + error_column=result.get("error_column"), + duration_ms=result.get("duration_ms", 0.0), + ) + + +def parse_batch_test_result( + result: dict, + run_start: float, +) -> ExecutionResult: + """Parse test result from batch runner JSON.""" + status_str = result.get("status", "runtime_error") + + if status_str == "success": + run_status = RunStatus.SUCCESS + elif status_str == "runtime_error": + run_status = RunStatus.RUNTIME_ERROR + elif status_str == "timeout": + run_status = RunStatus.TIMEOUT + elif status_str == "memory_exceeded": + run_status = RunStatus.MEMORY_EXCEEDED + elif status_str == "not_run": + run_status = RunStatus.NOT_RUN + elif status_str == "killed": + run_status = RunStatus.KILLED + else: + run_status = RunStatus.RUNTIME_ERROR + + duration_ms = result.get("duration_ms", 0.0) + total_ms = (time.perf_counter() - run_start) * 1000 + + return ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=run_status, + stdout=result.get("stdout", ""), + stderr=result.get("stderr", ""), + exit_code=result.get("exit_code"), + run_duration_ms=duration_ms, + total_duration_ms=total_ms, + cache_key=result.get("id", ""), # Pass test_id for matching in runner + ) diff --git 
a/src/ludic/envs/code_exec/podman_sandbox.py b/src/ludic/envs/code_exec/podman_sandbox.py new file mode 100644 index 0000000..e8b89e8 --- /dev/null +++ b/src/ludic/envs/code_exec/podman_sandbox.py @@ -0,0 +1,1031 @@ +""" +Podman-HPC sandbox implementation for code execution on HPC clusters. + +Provides: + - PodmanConfig: Configuration for Podman containers + - PodmanHPCSandbox: Async Podman container sandbox using subprocess + - PodmanHPCSandboxPool: Pool of Podman sandboxes with caching + +Podman-HPC is a daemonless container runtime wrapper for HPC clusters (e.g., Isambard). +Uses asyncio.create_subprocess_exec instead of docker-py SDK. + +**Important**: On some HPC systems (Isambard), podman-hpc's squashfs conversion +breaks the PATH variable. All commands in this module use absolute paths: + - /bin/sleep, /bin/mkdir, /bin/sh + - /usr/local/bin/python + - /usr/bin/pkill +""" + +from __future__ import annotations + +import asyncio +import io +import json +import logging +import math +import os +import re +import shutil +import tarfile +import time +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import AsyncIterator, Dict, List, Optional, Union + +from .parsing import ( + get_batch_runner_script, + parse_batch_compile_result, + parse_batch_test_result, + parse_syntax_error, +) +from .pool import BaseSandboxPool +from .sandbox import Sandbox, SandboxPool +from .types import ( + BatchExecutionSpec, + BatchTestResult, + CompileResult, + CompileStatus, + ExecutionResult, + RunStatus, + TestCase, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class PodmanConfig: + """Configuration for Podman-HPC sandboxes.""" + + memory_limit: str = "256m" + cpu_quota: Optional[float] = None # CPU limit (e.g., 0.5 = 50% of one CPU) + network_disabled: bool = True + working_dir: str = "/workspace" + gpu: bool = False # Pass --gpu flag for GPU access + extra_args: Optional[list[str]] = None # Additional podman-hpc run args + + +def 
_get_container_name_prefix() -> str: + """ + Get container name prefix including SLURM_JOB_ID if in a Slurm job. + + Returns: + Container name prefix like "ludic-sandbox-12345" or "ludic-sandbox-local" + """ + slurm_job_id = os.environ.get("SLURM_JOB_ID") + if slurm_job_id: + return f"ludic-sandbox-{slurm_job_id}" + return "ludic-sandbox-local" + + +class PodmanHPCSandbox: + """ + Async Podman-HPC container sandbox for Python code execution. + + Uses persistent containers (sleep infinity) with exec for code execution. + All operations use asyncio.create_subprocess_exec for non-blocking I/O. + + Podman Concurrency Note: + Podman has known issues with concurrent operations (deadlock above ~8 + simultaneous exec calls). All sandboxes in a pool share an exec_semaphore + to prevent overwhelming podman's lock manager. + """ + + def __init__( + self, + container_name: str, + image: str, + config: PodmanConfig, + python_version: str = "3.11", + exec_semaphore: Optional[asyncio.Semaphore] = None, + workspace_host_dir: Optional[str] = None, + ): + self._container_name = container_name + self._image = image + self._config = config + self._python_version = python_version + self._exec_semaphore = exec_semaphore # Shared across all sandboxes in pool + self._workspace_host_dir = workspace_host_dir + self._started = False + + @property + def python_version(self) -> str: + return self._python_version + + async def start(self) -> None: + """Create and start the persistent container.""" + if self._started: + return + + # Remove existing container if present + await self._run_podman("rm", "-f", self._container_name, check=False) + + # Build run command + cmd = ["run", "-d", "--name", self._container_name] + + # Resource limits + if self._config.memory_limit: + cmd.extend(["--memory", self._config.memory_limit]) + if self._config.cpu_quota: + cmd.extend(["--cpus", str(self._config.cpu_quota)]) + if self._config.network_disabled: + cmd.extend(["--network", "none"]) + if 
self._config.gpu: + cmd.append("--gpu") + if self._config.extra_args: + cmd.extend(self._config.extra_args) + + # Add bind mount if workspace_host_dir is set + if self._workspace_host_dir: + logger.info( + f"[{self._container_name}] Using bind mount: " + f"{self._workspace_host_dir} -> {self._config.working_dir}" + ) + cmd.extend( + ["-v", f"{self._workspace_host_dir}:{self._config.working_dir}:rw"] + ) + + # Image and command (use full path for HPC compatibility) + cmd.extend([self._image, "/bin/sleep", "infinity"]) + + # Capture stderr to provide useful error messages + await self._run_podman(*cmd, capture=True) + + # Ensure workspace directory exists (use full path for HPC compatibility) + # Skip if using bind mount (host directory should already exist) + if not self._workspace_host_dir: + await self._run_podman( + "exec", + self._container_name, + "/bin/mkdir", + "-p", + self._config.working_dir, + capture=True, + ) + + self._started = True + + async def stop(self) -> None: + """Stop and remove the container.""" + if not self._started: + return + + await self._run_podman("stop", "-t", "2", self._container_name, check=False) + await self._run_podman("rm", "-f", self._container_name, check=False) + self._started = False + + async def reset(self) -> None: + """Clear workspace directory (in-place, no container restart).""" + if not self._started: + return + + if self._workspace_host_dir: + # Direct host filesystem cleanup - no podman exec, no semaphore + logger.debug( + f"[{self._container_name}] reset() using direct host cleanup..." 
+ ) + start = time.perf_counter() + + workspace_path = Path(self._workspace_host_dir) + for item in workspace_path.iterdir(): + if item.is_dir(): + shutil.rmtree(item) + else: + item.unlink() + + elapsed = time.perf_counter() - start + logger.debug( + f"[{self._container_name}] reset() completed in {elapsed:.3f}s (direct)" + ) + return + + logger.debug(f"[{self._container_name}] reset() starting podman-hpc exec...") + start = time.perf_counter() + + await self._run_podman( + "exec", + self._container_name, + "/bin/sh", + "-c", + f"rm -rf {self._config.working_dir}/*", + ) + + elapsed = time.perf_counter() - start + logger.debug(f"[{self._container_name}] reset() completed in {elapsed:.3f}s") + + async def compile( + self, + code: str, + *, + timeout_s: float = 5.0, + ) -> CompileResult: + """Syntax-check code using py_compile.""" + start = time.perf_counter() + + try: + # Write code to container + await self._write_file("_check.py", code, timeout_s=timeout_s) + + # Run py_compile (use full path for HPC compatibility) + proc = await asyncio.wait_for( + self._run_podman( + "exec", + self._container_name, + "/usr/local/bin/python", + "-m", + "py_compile", + f"{self._config.working_dir}/_check.py", + check=False, + capture=True, + ), + timeout=timeout_s, + ) + + duration_ms = (time.perf_counter() - start) * 1000 + + if proc.returncode == 0: + return CompileResult( + status=CompileStatus.SUCCESS, + duration_ms=duration_ms, + ) + + # Parse error message + error_msg = proc.stderr or proc.stdout or "" + line, column, clean_msg = parse_syntax_error(error_msg) + + # Classify error type + status = CompileStatus.SYNTAX_ERROR + if "ImportError" in error_msg or "ModuleNotFoundError" in error_msg: + status = CompileStatus.IMPORT_ERROR + elif not clean_msg: + status = CompileStatus.UNKNOWN_ERROR + + return CompileResult( + status=status, + error_message=clean_msg or error_msg, + error_line=line, + error_column=column, + duration_ms=duration_ms, + ) + + except asyncio.TimeoutError: 
+ duration_ms = (time.perf_counter() - start) * 1000 + return CompileResult( + status=CompileStatus.TIMEOUT, + error_message=f"Compilation timed out after {timeout_s}s", + duration_ms=duration_ms, + ) + + async def execute( + self, + code: str, + *, + stdin: str = "", + skip_compile: bool = False, + timeout_s: float = 10.0, + memory_limit_mb: Optional[int] = None, + env_vars: Optional[Dict[str, str]] = None, + ) -> ExecutionResult: + """Execute code with full resource isolation.""" + # Step 1: Compile + if skip_compile: + compile_result = CompileResult(status=CompileStatus.SUCCESS) + else: + compile_result = await self.compile(code, timeout_s=timeout_s) + + total_start = time.perf_counter() + + if not compile_result.success: + total_ms = (time.perf_counter() - total_start) * 1000 + return ExecutionResult( + compile_result=compile_result, + run_status=RunStatus.NOT_RUN, + compile_duration_ms=compile_result.duration_ms, + total_duration_ms=total_ms, + ) + + # Step 2: Execute + run_start = time.perf_counter() + + try: + # Generate unique filename to avoid race conditions + exec_id = uuid.uuid4().hex[:8] + exec_filename = f"_exec_{exec_id}.py" + + # Write code to container + await self._write_file(exec_filename, code, timeout_s=timeout_s) + + # Build exec command + exec_cmd = ["exec"] + if stdin: + exec_cmd.append("-i") + + # Add environment variables + if env_vars: + for key, val in env_vars.items(): + exec_cmd.extend(["-e", f"{key}={val}"]) + + exec_cmd.extend( + [ + self._container_name, + "/usr/local/bin/python", + f"{self._config.working_dir}/{exec_filename}", + ] + ) + + # Run with timeout + proc = await asyncio.wait_for( + self._run_podman( + *exec_cmd, + check=False, + capture=True, + input_data=stdin.encode("utf-8") if stdin else None, + ), + timeout=timeout_s, + ) + + run_ms = (time.perf_counter() - run_start) * 1000 + total_ms = (time.perf_counter() - total_start) * 1000 + + # Classify run status + exit_code = proc.returncode + if exit_code == 0: + 
run_status = RunStatus.SUCCESS + elif exit_code == 137: # SIGKILL (OOM) + run_status = RunStatus.MEMORY_EXCEEDED + elif exit_code == 143: # SIGTERM + run_status = RunStatus.KILLED + else: + run_status = RunStatus.RUNTIME_ERROR + + return ExecutionResult( + compile_result=compile_result, + run_status=run_status, + stdout=proc.stdout or "", + stderr=proc.stderr or "", + exit_code=exit_code, + compile_duration_ms=compile_result.duration_ms, + run_duration_ms=run_ms, + total_duration_ms=total_ms, + ) + + except asyncio.TimeoutError: + run_ms = (time.perf_counter() - run_start) * 1000 + total_ms = (time.perf_counter() - total_start) * 1000 + + # Best-effort cleanup - goes through exec_semaphore so won't deadlock + try: + await self._run_podman( + "exec", + self._container_name, + "/usr/bin/pkill", + "-9", + "python", + check=False, + capture=True, + ) + except Exception: + pass # Best effort, reset() will clean up anyway + + return ExecutionResult( + compile_result=compile_result, + run_status=RunStatus.TIMEOUT, + stderr=f"Execution timed out after {timeout_s}s", + compile_duration_ms=compile_result.duration_ms, + run_duration_ms=run_ms, + total_duration_ms=total_ms, + ) + + async def _write_file( + self, + filename: str, + content: str, + *, + timeout_s: float = 5.0, + ) -> None: + """ + Write a file to the container using tar pipe. + + Creates a tar archive in memory and pipes it to container. + This is more robust than echo for handling special characters. 
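The in-memory tar construction this method relies on can be sketched standalone with only the stdlib (the `make_tar` helper is illustrative, not part of this module):

```python
import io
import tarfile
import time

def make_tar(filename: str, content: str) -> bytes:
    # Build a single-file tar archive entirely in memory,
    # mirroring how _write_file stages code for the container.
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w") as tar:
        data = content.encode("utf-8")
        info = tarfile.TarInfo(name=filename)
        info.size = len(data)
        info.mtime = int(time.time())
        tar.addfile(info, io.BytesIO(data))
    return buf.getvalue()

blob = make_tar("hello.py", "print('hi')\n")

# Round-trip check: extracting the archive recovers the original bytes.
with tarfile.open(fileobj=io.BytesIO(blob)) as tar:
    extracted = tar.extractfile("hello.py").read().decode("utf-8")
print(extracted == "print('hi')\n")  # True
```

The resulting bytes are what gets piped to `tar -xC <workdir>` inside the container, which avoids the quoting pitfalls of echo-based writes.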
+ """ + if self._workspace_host_dir: + # Direct host filesystem write - no podman exec, no semaphore + path = Path(self._workspace_host_dir) / filename + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content) + return + + # Create tar archive in memory + tar_buffer = io.BytesIO() + with tarfile.open(fileobj=tar_buffer, mode="w") as tar: + file_data = content.encode("utf-8") + tarinfo = tarfile.TarInfo(name=filename) + tarinfo.size = len(file_data) + tarinfo.mtime = int(time.time()) + tar.addfile(tarinfo, io.BytesIO(file_data)) + tar_buffer.seek(0) + + # Pipe tar to container + await asyncio.wait_for( + self._run_podman( + "exec", + "-i", + self._container_name, + "tar", + "-xC", + self._config.working_dir, + check=True, + capture=True, + input_data=tar_buffer.read(), + ), + timeout=timeout_s, + ) + + async def _run_podman( + self, + *args: str, + check: bool = True, + capture: bool = False, + input_data: Optional[bytes] = None, + ) -> "PodmanResult": + """ + Run a podman-hpc command asynchronously. + + For 'exec' commands, acquires the shared semaphore to prevent + overwhelming podman's lock manager (which deadlocks above ~8 + concurrent operations). + + Args: + *args: Command arguments (e.g., "exec", container_name, "python", ...) 
+ check: Raise exception if command fails + capture: Capture stdout/stderr + input_data: Data to pipe to stdin + + Returns: + PodmanResult with returncode, stdout, stderr + """ + is_exec = args and args[0] == "exec" + + # Use semaphore for exec commands to prevent podman deadlock + if is_exec and self._exec_semaphore: + async with self._exec_semaphore: + return await self._run_podman_inner( + *args, check=check, capture=capture, input_data=input_data + ) + else: + return await self._run_podman_inner( + *args, check=check, capture=capture, input_data=input_data + ) + + async def _run_podman_inner( + self, + *args: str, + check: bool = True, + capture: bool = False, + input_data: Optional[bytes] = None, + ) -> "PodmanResult": + """Actually run the podman-hpc command (called by _run_podman).""" + start = time.perf_counter() + + proc = await asyncio.create_subprocess_exec( + "podman-hpc", + *args, + stdin=asyncio.subprocess.PIPE if input_data else None, + stdout=asyncio.subprocess.PIPE if capture else asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.PIPE if capture else asyncio.subprocess.DEVNULL, + ) + + stdout_bytes, stderr_bytes = await proc.communicate(input=input_data) + + elapsed = time.perf_counter() - start + if elapsed > 1.0: + cmd_preview = " ".join(args[:4]) + logger.warning( + f"[{self._container_name}] SLOW podman-hpc {cmd_preview}... 
" + f"took {elapsed:.2f}s" + ) + + result = PodmanResult( + returncode=proc.returncode or 0, + stdout=stdout_bytes.decode("utf-8", errors="replace") + if stdout_bytes + else "", + stderr=stderr_bytes.decode("utf-8", errors="replace") + if stderr_bytes + else "", + ) + + if check and result.returncode != 0: + raise PodmanError( + f"podman-hpc {' '.join(args)} failed with exit code {result.returncode}:\n" + f"{result.stderr}" + ) + + return result + + # ------------------------------------------------------------------------- + # Batch execution (reduces semaphore acquisitions from O(N) to O(1)) + # ------------------------------------------------------------------------- + + async def execute_batch( + self, + spec: BatchExecutionSpec, + ) -> AsyncIterator[Union[CompileResult, ExecutionResult]]: + """ + Execute all tests in a single batch with streaming results. + + This method reduces semaphore acquisitions from O(2N+1) to O(3) by: + 1. Bundling code, manifest, and runner into a single tar + 2. Executing the batch runner once, which runs all tests sequentially + 3. 
Streaming results back as JSONL + + Args: + spec: Batch execution specification with code, tests, and options + + Yields: + CompileResult (if compile_first=True), then ExecutionResult for each test + """ + batch_dir = "_batch" + batch_start = time.perf_counter() + + # Build manifest for the batch runner + manifest = { + "code_file": "solution.py", + "compile_first": spec.compile_first, + "timeout_s": spec.timeout_s, + "stop_on_first_failure": spec.stop_on_first_failure, + "tests": [ + {"id": t.id or f"test_{i}", "stdin": t.input, "expected": t.expected} + for i, t in enumerate(spec.tests) + ], + } + + # Build tar archive with all files + tar_data = self._build_batch_tar( + manifest=manifest, + code=spec.code, + runner_script=get_batch_runner_script(), + batch_dir=batch_dir, + ) + + # Write tar to container (1 semaphore acquisition) + await self._write_tar(tar_data, timeout_s=spec.timeout_s) + + # Execute batch runner and stream results (1 semaphore acquisition) + manifest_path = f"{self._config.working_dir}/{batch_dir}/manifest.json" + runner_path = f"{self._config.working_dir}/{batch_dir}/batch_runner.py" + + # Track timing and received results + run_start = time.perf_counter() + received_done = False + received_test_ids: set[str] = set() + compile_result: Optional[CompileResult] = None + + # Calculate aggregate timeout accounting for parallelization in batch_runner + # With N workers: timeout = (ceil(N_tests / workers) × timeout_per_test) + buffer + num_workers = 16 # Matches batch_runner.py default for HPC + parallel_batches = math.ceil(len(spec.tests) / num_workers) if spec.tests else 1 + aggregate_timeout = ( + spec.timeout_s * parallel_batches + 60.0 + ) # 60s buffer for HPC + + try: + async with self._exec_semaphore: + proc = await asyncio.create_subprocess_exec( + "podman-hpc", + "exec", + "--workdir", + f"{self._config.working_dir}/{batch_dir}", + self._container_name, + "python", + runner_path, + manifest_path, + stdout=asyncio.subprocess.PIPE, + 
stderr=asyncio.subprocess.PIPE,
+                )
+
+                # Results collected from streaming to yield after timeout handling
+                streamed_results: list = []
+
+                async def _stream_results():
+                    """Stream results from batch runner, updating nonlocal state."""
+                    nonlocal received_done, compile_result
+                    async for line_bytes in proc.stdout:
+                        line = line_bytes.decode("utf-8", errors="replace").strip()
+                        if not line:
+                            continue
+
+                        try:
+                            result = json.loads(line)
+                        except json.JSONDecodeError:
+                            logger.warning(f"Invalid JSON from batch runner: {line}")
+                            continue
+
+                        result_type = result.get("type")
+
+                        if result_type == "compile":
+                            compile_result = parse_batch_compile_result(result)
+                            streamed_results.append(("compile", compile_result))
+                            if not compile_result.success:
+                                # Compilation failed, we're done
+                                break
+
+                        elif result_type == "test":
+                            test_id = result.get("id", "unknown")
+                            received_test_ids.add(test_id)
+                            exec_result = parse_batch_test_result(result, run_start)
+                            streamed_results.append(("test", exec_result))
+
+                        elif result_type == "done":
+                            received_done = True
+                            break
+
+                        elif result_type == "error":
+                            logger.error(f"Batch runner error: {result.get('message')}")
+
+                try:
+                    await asyncio.wait_for(_stream_results(), timeout=aggregate_timeout)
+                except asyncio.TimeoutError:
+                    logger.warning(
+                        f"[{self._container_name}] Batch timed out after {aggregate_timeout:.1f}s "
+                        f"({len(received_test_ids)}/{len(spec.tests)} tests received)"
+                    )
+                    proc.kill()
+
+                # Wait for the process only after draining stdout; awaiting
+                # proc.wait() before streaming can deadlock once the stdout
+                # pipe buffer fills.
+                await proc.wait()
+
+            # Yield all collected results
+            for result_type, result in streamed_results:
+                yield result
+
+        except asyncio.TimeoutError:
+            logger.warning(f"Batch execution timed out after {aggregate_timeout:.1f}s")
+
+        except Exception as e:
+            logger.warning(f"Batch execution stream broke: {e}")
+
+        # Handle missing tests (stream truncated before "done")
+        if not received_done and compile_result is None:
+            # No compile result received - emit a failure
+            
compile_result = CompileResult( + status=CompileStatus.UNKNOWN_ERROR, + error_message="Batch execution terminated unexpectedly", + duration_ms=(time.perf_counter() - batch_start) * 1000, + ) + yield compile_result + + if not received_done and (compile_result is None or compile_result.success): + # Some tests may not have been run + for i, test in enumerate(spec.tests): + test_id = test.id or f"test_{i}" + if test_id not in received_test_ids: + run_ms = (time.perf_counter() - run_start) * 1000 + yield ExecutionResult( + compile_result=compile_result + or CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SANDBOX_ERROR, + stdout="", + stderr="Batch execution terminated unexpectedly", + exit_code=None, + run_duration_ms=run_ms, + total_duration_ms=run_ms, + ) + + def _build_batch_tar( + self, + manifest: dict, + code: str, + runner_script: str, + batch_dir: str = "_batch", + ) -> bytes: + """Build tar archive containing batch execution files. + + Creates a tar with: + - {batch_dir}/manifest.json: Test configuration + - {batch_dir}/solution.py: Code under test + - {batch_dir}/batch_runner.py: Self-contained test runner + + Args: + manifest: Test configuration dict + code: Python code to test + runner_script: Content of batch_runner.py + batch_dir: Directory name within workspace + + Returns: + Tar archive bytes + """ + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w") as tar: + # Create directory entry first + dir_info = tarfile.TarInfo(name=batch_dir) + dir_info.type = tarfile.DIRTYPE + dir_info.mode = 0o755 + dir_info.mtime = int(time.time()) + tar.addfile(dir_info) + + # Add manifest.json + manifest_data = json.dumps(manifest, indent=2).encode("utf-8") + info = tarfile.TarInfo(name=f"{batch_dir}/manifest.json") + info.size = len(manifest_data) + info.mtime = int(time.time()) + tar.addfile(info, io.BytesIO(manifest_data)) + + # Add solution.py + code_data = code.encode("utf-8") + info = tarfile.TarInfo(name=f"{batch_dir}/solution.py") + 
info.size = len(code_data) + info.mtime = int(time.time()) + tar.addfile(info, io.BytesIO(code_data)) + + # Add batch_runner.py + runner_data = runner_script.encode("utf-8") + info = tarfile.TarInfo(name=f"{batch_dir}/batch_runner.py") + info.size = len(runner_data) + info.mtime = int(time.time()) + tar.addfile(info, io.BytesIO(runner_data)) + + buf.seek(0) + return buf.read() + + async def _write_tar( + self, + tar_data: bytes, + *, + timeout_s: float = 5.0, + ) -> None: + """Write a tar archive directly to the container. + + Similar to _write_file but takes raw tar bytes. + """ + if self._workspace_host_dir: + # Extract tar directly to host filesystem - no podman exec + buf = io.BytesIO(tar_data) + with tarfile.open(fileobj=buf, mode="r") as tar: + tar.extractall(path=self._workspace_host_dir) + return + + await asyncio.wait_for( + self._run_podman( + "exec", + "-i", + self._container_name, + "tar", + "-xC", + self._config.working_dir, + check=True, + capture=True, + input_data=tar_data, + ), + timeout=timeout_s, + ) + + +@dataclass +class PodmanResult: + """Result of a podman-hpc command.""" + + returncode: int + stdout: str + stderr: str + + +class PodmanError(Exception): + """Error from podman-hpc command.""" + + pass + + +class PodmanHPCSandboxPool(BaseSandboxPool[PodmanHPCSandbox]): + """ + Pool of persistent Podman-HPC containers with LRU caching. + + Manages container lifecycle, checkout/release, and execution caching. + Designed for HPC environments with Slurm job scheduling. 
+ + Inherits from BaseSandboxPool to use background reset pattern: + - checkout() returns pre-reset sandboxes instantly + - release() spawns background reset task + - shutdown() waits for pending resets before cleanup + """ + + def __init__( + self, + n_workers: int = 4, + image: str = "python:3.11-slim", + config: Optional[PodmanConfig] = None, + cache_size: int = 10000, + auto_replace_failed: bool = True, + max_consecutive_failures: int = 5, + max_concurrent_ops: int = 8, + workspace_base_dir: str = "auto", + ): + """ + Initialize Podman-HPC sandbox pool. + + Args: + n_workers: Number of sandboxes to create + image: Podman image (e.g., "python:3.11-slim") + config: Podman-specific configuration + cache_size: Maximum entries in execution cache + auto_replace_failed: If True, create new sandbox when reset fails + max_consecutive_failures: Maximum consecutive reset failures before raising + SandboxPoolExhaustedError (circuit breaker threshold) + max_concurrent_ops: Maximum concurrent operations (resets, executions) + workspace_base_dir: Base directory for bind mounts. 
Options: + - "auto" (default): Auto-detect; use /local if on HPC, else None + - explicit path: Use specified directory for bind mounts + - None: Disable bind mounts, use tar-based I/O + """ + super().__init__( + n_workers=n_workers, + cache_size=cache_size, + auto_replace_failed=auto_replace_failed, + max_consecutive_failures=max_consecutive_failures, + max_concurrent_ops=max_concurrent_ops, + ) + self._image = image + self._config = config or PodmanConfig() + self._exec_semaphore: Optional[asyncio.Semaphore] = None + + # Extract Python version from image name + self._python_version = self._parse_python_version(image) + + # Resolve workspace_base_dir + if workspace_base_dir == "auto": + # Auto-detect: use /local if on HPC, else None + slurm_job_id = os.environ.get("SLURM_JOB_ID") + if ( + slurm_job_id and Path("/home/u5ds/joanv.u5ds").exists() + ): # TODO [joan]: Remove hardcoding + self._workspace_base_dir: Optional[str] = ( + f"/home/u5ds/joanv.u5ds/sandbox/ludic-{slurm_job_id}" + ) + else: + self._workspace_base_dir = None + else: + self._workspace_base_dir = workspace_base_dir + + @property + def python_version(self) -> str: + """Python version used by sandboxes in this pool.""" + return self._python_version + + # ------------------------------------------------------------------------- + # Abstract method implementations (backend-specific logic) + # ------------------------------------------------------------------------- + + async def _create_sandboxes(self) -> List[PodmanHPCSandbox]: + """ + Create and start all Podman-HPC container sandboxes. + + Pulls the image (auto-migrates to shared storage on HPC) and creates + persistent containers in parallel. 
+ + Returns: + List of started PodmanHPCSandbox instances + """ + # Create shared exec semaphore (prevents podman deadlock) + self._exec_semaphore = asyncio.Semaphore(self._max_concurrent_ops) + logger.info( + f"Podman exec semaphore initialized: max_concurrent_ops={self._max_concurrent_ops}" + ) + + # Pull image (podman-hpc pull auto-migrates to shared storage) + logger.info( + f"Pulling image {self._image} (may take a moment for HPC migration)..." + ) + proc = await asyncio.create_subprocess_exec( + "podman-hpc", + "pull", + self._image, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + await proc.communicate() + + # If using bind mounts, create base directory + if self._workspace_base_dir: + Path(self._workspace_base_dir).mkdir(parents=True, exist_ok=True) + logger.info(f"Bind mount enabled: {self._workspace_base_dir}") + else: + logger.info("Bind mount disabled, using tar-based I/O") + + # Create and start sandboxes in parallel + container_prefix = _get_container_name_prefix() + + async def _create_and_start(i: int) -> PodmanHPCSandbox: + container_name = f"{container_prefix}-{i}" + + # Create per-sandbox host directory if using bind mounts + workspace_host_dir = None + if self._workspace_base_dir: + workspace_host_dir = f"{self._workspace_base_dir}/sandbox-{i}" + Path(workspace_host_dir).mkdir(parents=True, exist_ok=True) + + sandbox = PodmanHPCSandbox( + container_name=container_name, + image=self._image, + config=self._config, + python_version=self._python_version, + exec_semaphore=self._exec_semaphore, # Shared across all sandboxes + workspace_host_dir=workspace_host_dir, + ) + await sandbox.start() + return sandbox + + sandboxes = await asyncio.gather( + *[_create_and_start(i) for i in range(self._n_workers)] + ) + + logger.info(f"Podman-HPC sandbox pool ready ({self._n_workers} workers)") + return sandboxes + + async def _stop_sandbox(self, sandbox: PodmanHPCSandbox) -> None: + """ + Stop and remove a single Podman container. 
+ + Called during shutdown and when replacing a failed sandbox. + Handles errors gracefully (logs warnings, doesn't raise). + + Args: + sandbox: The sandbox to stop + """ + try: + await sandbox.stop() + except Exception as e: + logger.warning(f"Failed to stop Podman container: {e}") + + async def _create_replacement_sandbox(self) -> Optional[PodmanHPCSandbox]: + """ + Create a single replacement sandbox for a failed one. + + Creates a new container with the same configuration and starts it. + + Returns: + New PodmanHPCSandbox instance, or None if creation fails + """ + try: + container_prefix = _get_container_name_prefix() + # Use timestamp to ensure unique container name + container_name = f"{container_prefix}-replacement-{int(time.time())}" + + # Create per-sandbox host directory if using bind mounts + workspace_host_dir = None + if self._workspace_base_dir: + workspace_host_dir = ( + f"{self._workspace_base_dir}/sandbox-replacement-{int(time.time())}" + ) + Path(workspace_host_dir).mkdir(parents=True, exist_ok=True) + + sandbox = PodmanHPCSandbox( + container_name=container_name, + image=self._image, + config=self._config, + python_version=self._python_version, + exec_semaphore=self._exec_semaphore, # Use shared semaphore + workspace_host_dir=workspace_host_dir, + ) + await sandbox.start() + logger.info(f"Created replacement Podman sandbox: {container_name}") + return sandbox + except Exception as e: + logger.error(f"Failed to create replacement Podman sandbox: {e}") + return None + + async def shutdown(self) -> None: + """ + Shutdown pool and clean up resources. + + Stops all sandboxes and removes workspace directories if using bind mounts. 
+ """ + # Call parent shutdown to stop sandboxes + await super().shutdown() + + # Clean up host workspace directories + if self._workspace_base_dir: + workspace_path = Path(self._workspace_base_dir) + if workspace_path.exists(): + try: + shutil.rmtree(self._workspace_base_dir, ignore_errors=True) + logger.info( + f"Cleaned up workspace directory: {self._workspace_base_dir}" + ) + except Exception as e: + logger.warning(f"Failed to clean up workspace directory: {e}") + + # ------------------------------------------------------------------------- + # Helper methods + # ------------------------------------------------------------------------- + + @staticmethod + def _parse_python_version(image: str) -> str: + """Extract Python version from image name.""" + # Common patterns: python:3.11-slim, python:3.11, ghcr.io/.../python:3.11 + match = re.search(r"python:(\d+\.\d+)", image) + if match: + return match.group(1) + return "3.11" # Default fallback diff --git a/src/ludic/envs/code_exec/pool.py b/src/ludic/envs/code_exec/pool.py new file mode 100644 index 0000000..8a8cde4 --- /dev/null +++ b/src/ludic/envs/code_exec/pool.py @@ -0,0 +1,483 @@ +""" +Base sandbox pool with background reset pattern. + +Provides shared pool management logic for Docker, Podman, and other backends. +The background reset pattern ensures that sandbox cleanup happens off the +critical path, maximizing throughput for rollout generation. +""" + +from __future__ import annotations + +import asyncio +import logging +from abc import ABC, abstractmethod +from typing import Dict, Generic, List, Optional, Set, TypeVar + +from .cache import LRUCache +from .sandbox import Sandbox +from .types import BatchTestResult, SandboxPoolExhaustedError + +logger = logging.getLogger(__name__) + +# Type variable for sandbox implementations +S = TypeVar("S", bound=Sandbox) + + +class BaseSandboxPool(ABC, Generic[S]): + """ + Abstract base class for sandbox pools with background reset. 
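The background-reset pattern described here can be sketched with plain `asyncio`. This is a toy model, not the pool implementation — `ToyPool` and its fields are illustrative names:

```python
import asyncio

class ToyPool:
    """Minimal queue-based pool: release() resets in the background."""

    def __init__(self, items):
        self.queue = asyncio.Queue()
        self.pending = set()
        for item in items:
            self.queue.put_nowait(item)

    async def checkout(self):
        # Items sitting in the queue are always already reset.
        return await self.queue.get()

    def release(self, item):
        # Reset off the critical path; track the task so shutdown can wait.
        task = asyncio.create_task(self._reset(item))
        self.pending.add(task)
        task.add_done_callback(self.pending.discard)

    async def _reset(self, item):
        await asyncio.sleep(0)  # stand-in for container cleanup
        await self.queue.put(item)

async def demo():
    pool = ToyPool(["sb-0", "sb-1"])
    sb = await pool.checkout()
    pool.release(sb)                       # returns immediately
    await asyncio.gather(*pool.pending)    # drain, as shutdown() does
    return pool.queue.qsize()

print(asyncio.run(demo()))  # both sandboxes back in the queue -> 2
```

The caller never pays for the reset: `release()` returns at once, and the next `checkout()` pops a sandbox that was cleaned while the caller was doing other work.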
+ + Provides queue-based checkout/release with sandboxes reset in background + tasks (off critical path). Includes LRU caching, pending task tracking, + and error handling for failed resets. + + Subclasses must implement: _create_sandboxes(), _stop_sandbox(), python_version. + + Background Reset Pattern: + Released sandboxes are reset asynchronously and returned to the queue. + checkout() receives already-clean sandboxes instantly, hiding reset latency. + Failed resets discard the sandbox and optionally create a replacement. + """ + + def __init__( + self, + n_workers: int = 4, + cache_size: int = 10000, + auto_replace_failed: bool = True, + max_consecutive_failures: int = 5, + max_concurrent_ops: int = 8, + ): + """ + Initialize the pool. + + Args: + n_workers: Number of sandboxes to create + cache_size: Maximum entries in the execution cache + auto_replace_failed: If True, create new sandbox when reset fails + max_consecutive_failures: Maximum consecutive reset failures before raising + SandboxPoolExhaustedError (circuit breaker threshold) + max_concurrent_ops: Maximum concurrent sandbox operations (resets, exec + calls). Prevents podman/docker deadlock with too many simultaneous calls. 
+ """ + self._n_workers = n_workers + self._cache = LRUCache(max_size=cache_size) + self._auto_replace_failed = auto_replace_failed + self._max_consecutive_failures = max_consecutive_failures + self._max_concurrent_ops = max_concurrent_ops + + self._sandboxes: List[S] = [] + self._queue: Optional[asyncio.Queue[S]] = None + self._pending_resets: Set[asyncio.Task] = set() + self._started = False + self._shutting_down = False + self._consecutive_failures = 0 + + # ------------------------------------------------------------------------- + # Abstract methods (must be implemented by subclasses) + # ------------------------------------------------------------------------- + + @property + @abstractmethod + def python_version(self) -> str: + """Python version used by sandboxes in this pool.""" + ... + + @abstractmethod + async def _create_sandboxes(self) -> List[S]: + """ + Create all sandbox instances. + + Called by start(). Should create n_workers sandboxes, start them, + and return the list. This is where backend-specific logic lives + (Docker container creation, Podman-HPC setup, etc.). + + Returns: + List of started sandbox instances + """ + ... + + @abstractmethod + async def _stop_sandbox(self, sandbox: S) -> None: + """ + Stop and cleanup a single sandbox. + + Called during shutdown and when replacing a failed sandbox. + Should handle errors gracefully (log warnings, don't raise). + + Args: + sandbox: The sandbox to stop + """ + ... + + async def _create_replacement_sandbox(self) -> Optional[S]: + """ + Create a single replacement sandbox. + + Called when a sandbox fails to reset and auto_replace_failed is True. + Default implementation returns None (no replacement). Override in + subclass if dynamic sandbox creation is supported. 
+ + Returns: + New sandbox instance, or None if replacement not supported + """ + return None + + # ------------------------------------------------------------------------- + # Pool lifecycle + # ------------------------------------------------------------------------- + + @property + def available(self) -> int: + """Number of sandboxes currently available for checkout.""" + if self._queue is None: + return 0 + return self._queue.qsize() + + @property + def cache_stats(self) -> Dict[str, int]: + """Cache statistics (hits, misses, size, max_size).""" + return self._cache.stats + + @property + def pending_resets(self) -> int: + """Number of background reset tasks currently running.""" + return len(self._pending_resets) + + async def start(self) -> None: + """ + Initialize the pool. + + Creates all sandboxes and makes them available for checkout. + Idempotent - calling multiple times has no effect. + """ + if self._started: + return + + # Limits concurrent background reset TASKS (admission control) + # Prevents podman/docker deadlock with too many simultaneous operations + self._ops_semaphore = asyncio.Semaphore(self._max_concurrent_ops) + logger.info( + f"Pool starting: n_workers={self._n_workers}, " + f"max_concurrent_ops={self._max_concurrent_ops}" + ) + + # Create sandboxes (backend-specific) + self._sandboxes = await self._create_sandboxes() + + # Create queue and populate with all sandboxes + self._queue = asyncio.Queue() + for sandbox in self._sandboxes: + await self._queue.put(sandbox) + + self._started = True + logger.info( + f"Pool started: {len(self._sandboxes)} sandboxes ready, " + f"queue_size={self._queue.qsize()}" + ) + + async def shutdown(self) -> None: + """ + Tear down all sandboxes and release resources. + + Waits for all pending reset tasks to complete before stopping + sandboxes, ensuring clean shutdown without orphaned tasks. 
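`asyncio.gather(..., return_exceptions=True)` is what lets this shutdown tolerate failed resets; a minimal illustration:

```python
import asyncio

async def ok():
    return "reset-ok"

async def broken():
    raise RuntimeError("container died")

async def drain():
    # Exceptions are returned in place instead of propagating,
    # so one broken reset cannot abort the whole shutdown.
    return await asyncio.gather(ok(), broken(), return_exceptions=True)

results = asyncio.run(drain())
print(results[0])                  # reset-ok
print(type(results[1]).__name__)   # RuntimeError
```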
+ """ + if not self._started: + return + + self._shutting_down = True + + # Wait for all pending reset tasks to complete + if self._pending_resets: + logger.debug(f"Waiting for {len(self._pending_resets)} pending resets...") + await asyncio.gather(*self._pending_resets, return_exceptions=True) + + # Stop all sandboxes + for sandbox in self._sandboxes: + await self._stop_sandbox(sandbox) + + self._sandboxes.clear() + self._started = False + self._queue = None + self._shutting_down = False + + async def drain_pending_resets(self, timeout_s: float = 60.0) -> int: + """ + Wait for all pending reset tasks to complete. + + Call this before switching between high-concurrency phases + (e.g., before eval after training step) to ensure all sandboxes + are available in the queue. + + Uses asyncio.wait() instead of wait_for(gather()) to avoid + cancelling tasks on timeout (which would destroy sandboxes). + + Args: + timeout_s: Maximum time to wait for resets to complete + + Returns: + Number of resets that completed + """ + if not self._pending_resets: + logger.debug("Drain called but no pending resets") + return 0 + + # Snapshot the current tasks (set may change during await) + tasks = list(self._pending_resets) + count = len(tasks) + logger.info( + f"Draining {count} pending resets... " + f"queue: {self.available}/{self._n_workers}" + ) + + import time + start = time.time() + + # Use wait() instead of wait_for(gather()) - doesn't cancel on timeout + done, pending = await asyncio.wait(tasks, timeout=timeout_s) + + elapsed = time.time() - start + + if pending: + logger.warning( + f"Drain timeout after {elapsed:.1f}s! 
" + f"Completed: {len(done)}, still pending: {len(pending)}, " + f"queue: {self.available}/{self._n_workers}" + ) + else: + logger.info( + f"Drain complete in {elapsed:.1f}s: " + f"{len(done)} resets finished, " + f"queue: {self.available}/{self._n_workers}" + ) + + return len(done) + + # ------------------------------------------------------------------------- + # Checkout / Release with background reset + # ------------------------------------------------------------------------- + + async def checkout(self, timeout_s: float = 30.0) -> Sandbox: + """ + Get exclusive access to a sandbox. + + The returned sandbox is guaranteed to be in a clean state (reset + was performed in the background after the previous release). + + Waits on the queue for a sandbox to become available. Background + resets are rate-limited by semaphore to prevent backend deadlock. + + Args: + timeout_s: Maximum time to wait for a sandbox + + Returns: + Exclusive Sandbox handle + + Raises: + RuntimeError: If pool not started + TimeoutError: If no sandbox available within timeout + """ + if not self._started or self._queue is None: + raise RuntimeError("Pool not started. Call start() first.") + + import time + + start_time = time.monotonic() + deadline = start_time + timeout_s + attempt = 0 + + while True: + remaining = deadline - time.monotonic() + attempt += 1 + + if remaining <= 0: + # Detailed timeout diagnostics + semaphore_free = self._ops_semaphore._value if self._ops_semaphore else 0 + logger.error( + f"CHECKOUT TIMEOUT after {timeout_s}s! " + f"Pool: {self._n_workers}, available: {self.available}, " + f"pending_resets: {self.pending_resets}, " + f"semaphore: {semaphore_free}/{self._max_concurrent_ops} free, " + f"attempts: {attempt}" + ) + raise TimeoutError( + f"No sandbox available after {timeout_s}s. 
" + f"Pool size: {self._n_workers}, available: {self.available}, " + f"pending resets: {self.pending_resets}" + ) + + # Log if we're waiting with empty queue + if self._queue.empty() and attempt == 1: + logger.info( + f"Checkout waiting: queue empty, " + f"pending_resets: {self.pending_resets}" + ) + + try: + sandbox = await asyncio.wait_for( + self._queue.get(), + timeout=min(remaining, 5.0), # Short timeout to recheck + ) + wait_time = time.monotonic() - start_time + if wait_time > 1.0: + logger.info( + f"Checkout OK after {wait_time:.2f}s wait, " + f"queue now: {self._queue.qsize()}/{self._n_workers}" + ) + return sandbox + except asyncio.TimeoutError: + # Log periodic status during long waits + elapsed = time.monotonic() - start_time + logger.warning( + f"Checkout still waiting after {elapsed:.1f}s: " + f"available: {self.available}, pending: {self.pending_resets}" + ) + continue + + async def release(self, sandbox: Sandbox) -> None: + """ + Return a sandbox to the pool. + + The sandbox is reset in a background task, then returned to the + available queue. This makes release() return immediately without + blocking the caller. + + Args: + sandbox: The sandbox to release (must have been obtained via checkout) + + Raises: + RuntimeError: If pool not started + """ + if not self._started or self._queue is None: + raise RuntimeError("Pool not started") + + if self._shutting_down: + # During shutdown, don't spawn new tasks + return + + # Spawn background reset task + task = asyncio.create_task( + self._background_reset(sandbox), # type: ignore + name=f"sandbox-reset-{id(sandbox)}", + ) + self._pending_resets.add(task) + task.add_done_callback(self._pending_resets.discard) + + async def _background_reset(self, sandbox: S) -> None: + """ + Reset sandbox and return to queue (runs in background). + + Uses semaphore to limit concurrent reset operations (prevents + podman deadlock with too many simultaneous exec calls). 
+ + On success, the sandbox is returned to the available queue. + On failure, the sandbox is discarded and optionally replaced. + """ + import time + + sandbox_id = id(sandbox) % 10000 # Short ID for logging + wait_start = time.time() + + # Limit concurrent ops to prevent podman/docker deadlock + async with self._ops_semaphore: + wait_elapsed = time.time() - wait_start + if wait_elapsed > 0.1: + logger.debug(f"[SB-{sandbox_id}] Semaphore acquired after {wait_elapsed:.2f}s wait") + + reset_start = time.time() + try: + await sandbox.reset() + reset_elapsed = time.time() - reset_start + total_elapsed = time.time() - wait_start + + if self._queue is not None and not self._shutting_down: + await self._queue.put(sandbox) + logger.debug( + f"[SB-{sandbox_id}] Reset OK: {reset_elapsed:.2f}s reset, " + f"{total_elapsed:.2f}s total. " + f"Queue now: {self._queue.qsize()}/{self._n_workers}" + ) + except Exception as e: + reset_elapsed = time.time() - reset_start + logger.error( + f"[SB-{sandbox_id}] Reset FAILED after {reset_elapsed:.2f}s: {e}" + ) + await self._handle_reset_failure(sandbox, e) + + async def _handle_reset_failure(self, sandbox: S, error: Exception) -> None: + """ + Handle a sandbox that failed to reset. + + Logs the error, removes the sandbox from the pool, and optionally + creates a replacement. Implements circuit breaker pattern to detect + systemic failures. + + Raises: + SandboxPoolExhaustedError: If consecutive failures exceed threshold + """ + # Increment failure counter + self._consecutive_failures += 1 + + logger.warning( + f"Sandbox reset failed: {error}. Discarding sandbox. " + f"Consecutive failures: {self._consecutive_failures}/{self._max_consecutive_failures}" + ) + + # Check circuit breaker threshold + if self._consecutive_failures >= self._max_consecutive_failures: + logger.error( + f"Circuit breaker triggered: {self._consecutive_failures} consecutive " + f"sandbox reset failures. Pool is exhausted." 
+ ) + raise SandboxPoolExhaustedError( + f"Sandbox pool exhausted after {self._consecutive_failures} consecutive " + f"reset failures. This indicates a systemic issue requiring operator intervention." + ) + + # Remove from tracked sandboxes + if sandbox in self._sandboxes: + self._sandboxes.remove(sandbox) + + # Try to stop the failed sandbox + try: + await self._stop_sandbox(sandbox) + except Exception as stop_error: + logger.warning(f"Failed to stop broken sandbox: {stop_error}") + + # Optionally create replacement + if self._auto_replace_failed and not self._shutting_down: + try: + replacement = await self._create_replacement_sandbox() + if replacement is not None: + self._sandboxes.append(replacement) + if self._queue is not None: + await self._queue.put(replacement) + # Reset failure counter on successful replacement + self._consecutive_failures = 0 + logger.info( + "Created replacement sandbox after reset failure. " + "Consecutive failure counter reset." + ) + except Exception as create_error: + logger.warning(f"Failed to create replacement sandbox: {create_error}") + + # ------------------------------------------------------------------------- + # Cache interface + # ------------------------------------------------------------------------- + + def get_cached( + self, + code_hash: str, + tests_hash: str, + ) -> Optional[BatchTestResult]: + """Check cache for result (sync, thread-safe).""" + return self._cache.get(code_hash, tests_hash) + + def put_cached( + self, + code_hash: str, + tests_hash: str, + result: BatchTestResult, + ) -> None: + """Store result in cache (sync, thread-safe).""" + self._cache.put(code_hash, tests_hash, result) diff --git a/src/ludic/envs/code_exec/runners.py b/src/ludic/envs/code_exec/runners.py new file mode 100644 index 0000000..6f0f93d --- /dev/null +++ b/src/ludic/envs/code_exec/runners.py @@ -0,0 +1,620 @@ +""" +Code runners for executing code against test cases. 
+ +This module defines the CodeRunner protocol and concrete implementations +for different test execution strategies (stdin/stdout, function calls, etc.). + +The runner is responsible for: + 1. Orchestrating compilation and execution via a Sandbox + 2. Running code against multiple TestCases + 3. Using an OutputVerifier to compare results + 4. Building rich TestResult and BatchTestResult objects +""" + +from __future__ import annotations + +import asyncio +import hashlib +import json +import logging +from typing import List, Optional, Protocol, Set, runtime_checkable + +from .adapters.base import OutputVerifier +from .sandbox import Sandbox +from .types import ( + BatchExecutionSpec, + BatchTestResult, + CompileResult, + CompileStatus, + ExecutionResult, + RunStatus, + TestCase, + TestResult, +) + +logger = logging.getLogger(__name__) + + +def compute_hash(content: str) -> str: + """ + Compute SHA256 hash, return first 16 hex chars. + + This is used for cache keys to uniquely identify code and test sets. + 16 hex chars = 64 bits, which gives collision probability < 1e-10 + for reasonable dataset sizes. + + Args: + content: String to hash + + Returns: + First 16 characters of SHA256 hex digest + """ + return hashlib.sha256(content.encode()).hexdigest()[:16] + + +def hash_tests(tests: List[TestCase]) -> str: + """ + Compute stable hash of test cases for caching. + + Creates a deterministic hash by converting test inputs and expected + outputs to a canonical JSON representation with sorted keys, then hashing. + + Args: + tests: List of test cases to hash + + Returns: + 16-character hash string + """ + # Use JSON with sorted keys for deterministic serialization + content = json.dumps( + [(t.input, t.expected) for t in tests], + sort_keys=True, + default=str, # Handle non-JSON-serializable types + ) + return compute_hash(content) + + +@runtime_checkable +class CodeRunner(Protocol): + """ + Protocol for running code against test cases. 
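The cache-key helpers above (`compute_hash`, `hash_tests`) rely on two stdlib properties: SHA-256 truncated to 16 hex chars, and `json.dumps(sort_keys=True)` as a canonical form. A quick check, re-implemented inline under those assumptions:

```python
import hashlib
import json

def compute_hash(content: str) -> str:
    # First 16 hex chars of SHA-256, as used for cache keys above.
    return hashlib.sha256(content.encode()).hexdigest()[:16]

# sort_keys gives a canonical serialization, so logically identical
# test sets hash identically regardless of dict key ordering.
a = json.dumps([("1 2\n", {"x": 1, "y": 2})], sort_keys=True, default=str)
b = json.dumps([("1 2\n", {"y": 2, "x": 1})], sort_keys=True, default=str)

assert a == b
print(len(compute_hash(a)))  # 16
```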
+
+    A runner orchestrates the interaction between a Sandbox and test cases,
+    using an OutputVerifier to determine if each test passes. It handles
+    compilation, execution, error recovery, and early stopping.
+
+    Implementations should be stateless and reusable across multiple
+    test runs. All state is passed explicitly via arguments.
+    """
+
+    async def run_tests(
+        self,
+        sandbox: Sandbox,
+        code: str,
+        tests: List[TestCase],
+        *,
+        verifier: OutputVerifier,
+        stop_on_first_failure: bool = False,
+        compile_first: bool = True,
+    ) -> BatchTestResult:
+        """
+        Run code against all test cases and return aggregated results.
+
+        Args:
+            sandbox: Sandbox to execute code in (must be checked out)
+            code: Source code to test
+            tests: List of test cases to run
+            verifier: Verifier to compare actual vs expected output
+            stop_on_first_failure: If True, skip remaining tests after first failure
+            compile_first: If True, compile once before running tests
+
+        Returns:
+            BatchTestResult with individual test results and metadata
+        """
+        ...
+
+
+class StdinStdoutRunner:
+    """
+    Runner for APPS-style stdin/stdout testing.
+
+    This runner executes code that reads from stdin and writes to stdout,
+    comparing the output against expected values. This is the standard
+    format for competitive programming problems (Codeforces, APPS, etc.).
+
+    Each test case's `input` field is passed as stdin, and the `expected`
+    field is compared against stdout using the provided verifier.
+
+    Design notes:
+    - Default timeout is 5.0s per test to keep batches fast
+    - Compilation is checked first by default to get early failure signal
+    - All operations are async to avoid blocking the event loop
+    - Rich error details in TestResult.comparison_details
+    """
+
+    def __init__(
+        self,
+        default_timeout_s: float = 5.0,
+        memory_limit_mb: Optional[int] = 256,
+        use_batch_execution: bool = True,
+    ) -> None:
+        """
+        Initialize the runner with default resource limits.
+ + Args: + default_timeout_s: Default execution timeout per test (seconds). + Tests can override via metadata["timeout_s"]. + memory_limit_mb: Memory limit for execution (None = no limit) + use_batch_execution: If True and sandbox supports it, use batched + execution to reduce semaphore acquisitions. + """ + self._default_timeout_s = default_timeout_s + self._memory_limit_mb = memory_limit_mb + self._use_batch_execution = use_batch_execution + + async def run_tests( + self, + sandbox: Sandbox, + code: str, + tests: List[TestCase], + *, + verifier: OutputVerifier, + stop_on_first_failure: bool = False, + compile_first: bool = True, + ) -> BatchTestResult: + """ + Run stdin/stdout tests against code. + + Implementation steps: + 1. Compute code_hash and tests_hash for caching + 2. If compile_first=True, compile code and fail fast if it fails + 3. For each test: + - Execute with test.input as stdin + - Compare stdout against test.expected using verifier + - Build TestResult with full metadata + 4. If stop_on_first_failure=True, mark remaining tests NOT_RUN + 5. 
Return BatchTestResult + + Args: + sandbox: Sandbox to execute code in (must be checked out) + code: Source code to test + tests: List of test cases (input/expected are stdin/stdout strings) + verifier: Verifier to compare stdout vs expected + stop_on_first_failure: If True, skip remaining tests after first failure + compile_first: If True, compile once before running tests + + Returns: + BatchTestResult with results for each test + """ + import time + + run_start = time.perf_counter() + + # Compute hashes for caching + code_hash = compute_hash(code) + tests_hash_val = hash_tests(tests) + + # Use batch execution if enabled and sandbox supports it + has_batch = hasattr(sandbox, "execute_batch") + logger.debug( + f"run_tests: use_batch={self._use_batch_execution}, " + f"has_execute_batch={has_batch}, num_tests={len(tests)}" + ) + + if self._use_batch_execution and has_batch: + result = await self._run_tests_batched( + sandbox=sandbox, + code=code, + tests=tests, + verifier=verifier, + stop_on_first_failure=stop_on_first_failure, + compile_first=compile_first, + code_hash=code_hash, + tests_hash=tests_hash_val, + ) + elapsed_ms = (time.perf_counter() - run_start) * 1000 + logger.debug( + f"Batch execution completed: {len(tests)} tests in {elapsed_ms:.1f}ms, " + f"passed={result.passed_count}/{result.total_count}" + ) + return result + + # Non-batch execution + # Step 1: Compile first if requested + compile_result: Optional[CompileResult] = None + if compile_first: + compile_result = await sandbox.compile( + code, + timeout_s=self._default_timeout_s, + ) + + # If compilation failed, all tests fail without execution + if not compile_result.success: + return self._create_all_failed_batch( + tests=tests, + code_hash=code_hash, + tests_hash=tests_hash_val, + compile_result=compile_result, + reason="compilation_failed", + ) + + # Step 2: Run tests (in parallel when possible) + if stop_on_first_failure: + # Sequential execution with early stopping + results: List[TestResult] 
= [] + for test_case in tests: + # Get timeout for this test (allow per-test override) + timeout_s = test_case.metadata.get("timeout_s", self._default_timeout_s) + memory_limit = test_case.metadata.get( + "memory_limit_mb", self._memory_limit_mb + ) + + # Execute the test + test_result = await self._run_single_test( + sandbox=sandbox, + code=code, + test_case=test_case, + verifier=verifier, + timeout_s=timeout_s, + memory_limit_mb=memory_limit, + skip_compile=compile_first, # Skip if we already compiled + ) + + results.append(test_result) + + # Stop on first failure + if not test_result.passed: + # Mark remaining tests as NOT_RUN + for remaining_test in tests[len(results) :]: + not_run_result = self._create_not_run_result( + test_case=remaining_test, + code_hash=code_hash, + ) + results.append(not_run_result) + break + else: + # Parallel execution with asyncio.gather + async def run_test_with_metadata(test_case: TestCase) -> TestResult: + timeout_s = test_case.metadata.get("timeout_s", self._default_timeout_s) + memory_limit = test_case.metadata.get( + "memory_limit_mb", self._memory_limit_mb + ) + return await self._run_single_test( + sandbox=sandbox, + code=code, + test_case=test_case, + verifier=verifier, + timeout_s=timeout_s, + memory_limit_mb=memory_limit, + skip_compile=compile_first, # Skip if we already compiled + ) + + # Run all tests in parallel + results = await asyncio.gather( + *[run_test_with_metadata(test) for test in tests] + ) + + return BatchTestResult( + results=list(results), + code_hash=code_hash, + tests_hash=tests_hash_val, + ) + + async def _run_single_test( + self, + sandbox: Sandbox, + code: str, + test_case: TestCase, + verifier: OutputVerifier, + timeout_s: float, + memory_limit_mb: Optional[int], + skip_compile: bool = False, + ) -> TestResult: + """ + Run a single test case. 
+ + Args: + sandbox: Sandbox to execute in + code: Source code + test_case: Test to run + verifier: Output verifier + timeout_s: Execution timeout + memory_limit_mb: Memory limit + skip_compile: If True, skip compilation (assumes already compiled) + + Returns: + TestResult for this test + """ + # Execute code with test input + execution = await sandbox.execute( + code=code, + stdin=str(test_case.input), # Ensure input is string + skip_compile=skip_compile, + timeout_s=timeout_s, + memory_limit_mb=memory_limit_mb, + ) + + # If execution failed (didn't compile or runtime error), test fails + if not execution.succeeded: + return TestResult( + test_case=test_case, + passed=False, + actual=execution.stdout, + execution=execution, + comparison_details=self._get_execution_failure_details(execution), + ) + + # Execution succeeded, compare output + actual_output = execution.stdout + expected_output = str(test_case.expected) + + passed, comparison_details = verifier.verify(actual_output, expected_output) + + return TestResult( + test_case=test_case, + passed=passed, + actual=actual_output, + execution=execution, + comparison_details=comparison_details, + ) + + async def _run_tests_batched( + self, + sandbox: Sandbox, + code: str, + tests: List[TestCase], + verifier: OutputVerifier, + stop_on_first_failure: bool, + compile_first: bool, + code_hash: str, + tests_hash: str, + ) -> BatchTestResult: + """ + Run tests using batch execution API with crash resilience. + + This method uses the sandbox's execute_batch() to run all tests + in a single podman exec call, reducing semaphore acquisitions + from O(2N) to O(2). 
+ + Args: + sandbox: Sandbox with execute_batch() method + code: Source code to test + tests: List of test cases + verifier: Output verifier for comparing results + stop_on_first_failure: If True, stop after first failure + compile_first: If True, compile before running tests + code_hash: Pre-computed hash of code + tests_hash: Pre-computed hash of tests + + Returns: + BatchTestResult with results for each test + """ + spec = BatchExecutionSpec( + code=code, + tests=tests, + compile_first=compile_first, + timeout_s=self._default_timeout_s, + stop_on_first_failure=stop_on_first_failure, + ) + + results: List[TestResult] = [] + compile_result: Optional[CompileResult] = None + received_done = False + received_test_ids: Set[str] = set() + + # Build lookup for test cases by ID + test_by_id = {t.id: t for t in tests} + + try: + async for result in sandbox.execute_batch(spec): + if isinstance(result, CompileResult): + compile_result = result + if not result.success: + # Compilation failed - return batch with all tests failed + return self._create_all_failed_batch( + tests=tests, + code_hash=code_hash, + tests_hash=tests_hash, + compile_result=compile_result, + reason="compilation_failed", + ) + elif isinstance(result, ExecutionResult): + # This is a test result - find the matching test case + # The execute_batch implementation tags results with test_id + # in the cache_key field + test_id = result.cache_key or "" + received_test_ids.add(test_id) + + test_case = test_by_id.get(test_id) + if test_case is None: + logger.warning( + f"Received result for unknown test_id: {test_id}" + ) + continue + + # Build TestResult from ExecutionResult + if not result.succeeded: + # Execution failed + test_result = TestResult( + test_case=test_case, + passed=False, + actual=result.stdout, + execution=result, + comparison_details=self._get_execution_failure_details( + result + ), + ) + else: + # Execution succeeded, compare output + actual_output = result.stdout + expected_output = 
str(test_case.expected) + passed, comparison_details = verifier.verify( + actual_output, expected_output + ) + test_result = TestResult( + test_case=test_case, + passed=passed, + actual=actual_output, + execution=result, + comparison_details=comparison_details, + ) + results.append(test_result) + elif isinstance(result, dict) and result.get("type") == "done": + received_done = True + break + + except Exception as e: + # Stream broke unexpectedly (OOM, container killed, etc.) + logger.warning(f"Batch execution stream broke: {e}") + + # Handle missing tests (stream truncated before "done") + if not received_done: + for test in tests: + if test.id not in received_test_ids: + # Create SANDBOX_ERROR result for missing tests + execution = ExecutionResult( + compile_result=compile_result + or CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SANDBOX_ERROR, + stdout="", + stderr="Batch execution terminated unexpectedly", + exit_code=None, + ) + results.append( + TestResult( + test_case=test, + passed=False, + actual="", + execution=execution, + comparison_details="Sandbox crashed before this test completed", + ) + ) + + return BatchTestResult( + results=results, + code_hash=code_hash, + tests_hash=tests_hash, + ) + + def _get_execution_failure_details(self, execution: ExecutionResult) -> str: + """ + Generate human-readable details for execution failures. 
+ + Args: + execution: The failed execution result + + Returns: + Explanation of why execution failed + """ + # Compilation failure + if not execution.compiled: + compile_msg = execution.compile_result.error_message or "Unknown error" + if execution.compile_result.error_line is not None: + return f"Compilation failed at line {execution.compile_result.error_line}: {compile_msg}" + return f"Compilation failed: {compile_msg}" + + # Runtime failure + if execution.run_status == RunStatus.TIMEOUT: + return f"Execution timed out after {execution.run_duration_ms:.0f}ms" + + if execution.run_status == RunStatus.MEMORY_EXCEEDED: + return "Memory limit exceeded" + + if execution.run_status == RunStatus.RUNTIME_ERROR: + stderr = execution.stderr.strip() + if stderr: + # Show first few lines of stderr for debugging + stderr_lines = stderr.split("\n") + preview = "\n".join(stderr_lines[:5]) + if len(stderr_lines) > 5: + preview += f"\n... ({len(stderr_lines) - 5} more lines)" + return f"Runtime error:\n{preview}" + return f"Runtime error (exit code {execution.exit_code})" + + # Other failure + return f"Execution failed with status: {execution.run_status}" + + def _create_all_failed_batch( + self, + tests: List[TestCase], + code_hash: str, + tests_hash: str, + compile_result: CompileResult, + reason: str, + ) -> BatchTestResult: + """ + Create a BatchTestResult where all tests failed due to compilation error. 
+ + Args: + tests: All test cases + code_hash: Hash of the code + tests_hash: Hash of the tests + compile_result: The failed compilation result + reason: Reason for batch failure + + Returns: + BatchTestResult with all tests marked as failed + """ + results: List[TestResult] = [] + + for test_case in tests: + # Create ExecutionResult with the compile failure + execution = ExecutionResult( + compile_result=compile_result, + run_status=None, # Never ran + stdout="", + stderr="", + exit_code=None, + compile_duration_ms=compile_result.duration_ms, + run_duration_ms=0.0, + total_duration_ms=compile_result.duration_ms, + ) + + test_result = TestResult( + test_case=test_case, + passed=False, + actual="", + execution=execution, + comparison_details=self._get_execution_failure_details(execution), + ) + results.append(test_result) + + return BatchTestResult( + results=results, + code_hash=code_hash, + tests_hash=tests_hash, + ) + + def _create_not_run_result( + self, + test_case: TestCase, + code_hash: str, + ) -> TestResult: + """ + Create a TestResult for a test that was skipped. + + Args: + test_case: The test case that was skipped + code_hash: Hash of the code (for metadata) + + Returns: + TestResult marked as NOT_RUN + """ + # Create a minimal ExecutionResult indicating the test wasn't run + execution = ExecutionResult( + compile_result=CompileResult( + status=CompileStatus.SUCCESS # Compilation already succeeded + ), + run_status=RunStatus.NOT_RUN, + stdout="", + stderr="", + exit_code=None, + ) + + return TestResult( + test_case=test_case, + passed=False, + actual="", + execution=execution, + comparison_details="Test skipped (stop_on_first_failure=True)", + ) diff --git a/src/ludic/envs/code_exec/sandbox.py b/src/ludic/envs/code_exec/sandbox.py new file mode 100644 index 0000000..d2b2d81 --- /dev/null +++ b/src/ludic/envs/code_exec/sandbox.py @@ -0,0 +1,239 @@ +""" +Sandbox protocols for isolated code execution. 
+ +These protocols define the contract for sandbox implementations. +The actual implementations (Docker, subprocess, etc.) live in separate modules. +""" + +from __future__ import annotations + +from typing import Dict, Optional, Protocol, runtime_checkable + +from .types import BatchTestResult, CompileResult, ExecutionResult + + +@runtime_checkable +class Sandbox(Protocol): + """ + Async handle to a single isolated execution environment. + + Invariants: + - A sandbox is exclusive to one env instance at a time + - reset() clears all state from previous executions + - All operations are async to avoid blocking the event loop + + Lifecycle: + 1. Obtained via SandboxPool.checkout() + 2. reset() called to ensure clean state + 3. compile() and/or execute() called as needed + 4. Returned via SandboxPool.release() + + Implementations should ensure: + - Network isolation (no external access) + - Resource limits (CPU, memory) + - Timeout enforcement + - Filesystem isolation between uses + """ + + @property + def python_version(self) -> str: + """Python version in this sandbox (e.g., '3.11').""" + ... + + async def reset(self) -> None: + """ + Clear filesystem, kill processes, restore to clean state. + + Must be called before first use and is automatically called + by SandboxPool.release(). + """ + ... + + async def compile( + self, + code: str, + *, + timeout_s: float = 5.0, + ) -> CompileResult: + """ + Syntax-check / compile code without executing. + + For Python: runs py_compile or ast.parse to catch syntax errors. + For compiled languages: runs the compiler. + + Args: + code: Source code to compile/check + timeout_s: Maximum time for compilation + + Returns: + CompileResult with status and error details if failed + """ + ... 
+ + async def execute( + self, + code: str, + *, + stdin: str = "", + skip_compile: bool = False, + timeout_s: float = 10.0, + memory_limit_mb: Optional[int] = None, + env_vars: Optional[Dict[str, str]] = None, + ) -> ExecutionResult: + """ + Execute code and return rich results. + + Implicitly compiles first if not already compiled (unless skip_compile=True). + The compile result is included in the returned ExecutionResult. + + Args: + code: Source code to execute + stdin: Input to feed to the process via stdin + skip_compile: If True, skip compilation step (assumes code already compiled) + timeout_s: Maximum execution time (excluding compilation) + memory_limit_mb: Memory limit override (None uses sandbox default) + env_vars: Additional environment variables + + Returns: + ExecutionResult with compile status, output, timing, etc. + """ + ... + + +@runtime_checkable +class SandboxPool(Protocol): + """ + Async pool of reusable sandboxes with caching. + + The pool manages: + 1. Sandbox lifecycle (start/stop containers, processes, etc.) + 2. Checkout/release of exclusive sandbox handles + 3. Execution cache (code+tests -> result) + + Lifecycle: + 1. start() - Initialize pool (start containers, etc.) + 2. checkout() - Get exclusive sandbox access + 3. release() - Return sandbox to pool + 4. shutdown() - Tear down all sandboxes + + The pool should be started once at application startup and shared + across all CodeExecEnv instances via factory closure injection. + + Caching: + The pool maintains an LRU cache keyed by (code_hash, tests_hash). + This avoids redundant execution when the same code is submitted + for the same tests (common in GRPO where multiple generations + are evaluated against the same problem). + """ + + @property + def python_version(self) -> str: + """Python version used by sandboxes in this pool.""" + ... + + @property + def available(self) -> int: + """Number of sandboxes currently available for checkout.""" + ... 
+ + @property + def cache_stats(self) -> Dict[str, int]: + """ + Cache statistics. + + Returns dict with keys: + - hits: number of cache hits + - misses: number of cache misses + - size: current cache size + - max_size: maximum cache size + """ + ... + + async def start(self) -> None: + """ + Initialize the pool. + + This starts all sandboxes (containers, processes, etc.). + Should be called once before any checkout() calls. + Idempotent - calling multiple times has no effect. + """ + ... + + async def checkout(self, timeout_s: float = 30.0) -> Sandbox: + """ + Get exclusive access to a sandbox. + + Blocks until a sandbox is available or timeout is reached. + The returned sandbox is guaranteed to be in a clean state. + + Args: + timeout_s: Maximum time to wait for a sandbox + + Returns: + Exclusive Sandbox handle + + Raises: + TimeoutError: If no sandbox available within timeout + """ + ... + + async def release(self, sandbox: Sandbox) -> None: + """ + Return a sandbox to the pool. + + The sandbox is automatically reset before being made available + to other callers. + + Args: + sandbox: The sandbox to release (must have been obtained via checkout) + """ + ... + + async def shutdown(self) -> None: + """ + Tear down all sandboxes and release resources. + + After shutdown(), the pool cannot be used again without calling start(). + """ + ... + + # ----- Cache interface ----- + + def get_cached( + self, + code_hash: str, + tests_hash: str, + ) -> Optional[BatchTestResult]: + """ + Check if we have a cached result for this code+tests pair. + + This is a synchronous method for use from env_step(). + Thread-safe. + + Args: + code_hash: Hash of the submitted code + tests_hash: Hash of the test cases + + Returns: + Cached BatchTestResult if found, None otherwise + """ + ... + + def put_cached( + self, + code_hash: str, + tests_hash: str, + result: BatchTestResult, + ) -> None: + """ + Cache a result for future lookups. 
+ + This is a synchronous method for use from env_step(). + Thread-safe. Uses LRU eviction when cache is full. + + Args: + code_hash: Hash of the submitted code + tests_hash: Hash of the test cases + result: The BatchTestResult to cache + """ + ... diff --git a/src/ludic/envs/code_exec/types.py b/src/ludic/envs/code_exec/types.py new file mode 100644 index 0000000..806559b --- /dev/null +++ b/src/ludic/envs/code_exec/types.py @@ -0,0 +1,258 @@ +""" +Core types for code execution environments. + +These types capture rich metadata about code compilation and execution, +providing RL-relevant signals for reward shaping and analysis. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Dict, List, Optional + + +class SandboxPoolExhaustedError(Exception): + """ + Raised when sandbox pool experiences too many consecutive failures. + + This indicates a systemic issue with sandbox creation/reset that + requires operator intervention. + """ + + pass + + +class CompileStatus(Enum): + """Status of code compilation/syntax checking.""" + + SUCCESS = "success" + SYNTAX_ERROR = "syntax_error" + IMPORT_ERROR = "import_error" + TIMEOUT = "timeout" + UNKNOWN_ERROR = "unknown_error" + + +class RunStatus(Enum): + """Status of code execution.""" + + SUCCESS = "success" + RUNTIME_ERROR = "runtime_error" + TIMEOUT = "timeout" + MEMORY_EXCEEDED = "memory_exceeded" + KILLED = "killed" + NOT_RUN = "not_run" # e.g., skipped due to earlier failure + SANDBOX_ERROR = "sandbox_error" # sandbox crashed, not user code + + +@dataclass +class CompileResult: + """ + Result of compiling/syntax-checking code. + + For Python, this typically uses py_compile or ast.parse to catch + syntax errors before execution. 
+ """ + + status: CompileStatus + error_message: Optional[str] = None + error_line: Optional[int] = None + error_column: Optional[int] = None + duration_ms: float = 0.0 + + @property + def success(self) -> bool: + return self.status == CompileStatus.SUCCESS + + +@dataclass +class ExecutionResult: + """ + Rich result of running code in a sandbox. + + All fields are RL-relevant metadata that can be used for: + - Reward shaping (compile errors vs runtime errors vs wrong answer) + - Curriculum learning (filter by execution characteristics) + - Analysis (understanding failure modes) + + This is the atomic unit returned by sandbox.execute(). + """ + + # Compilation phase + compile_result: CompileResult + + # Execution phase (only meaningful if compilation succeeded) + run_status: Optional[RunStatus] = None + stdout: str = "" + stderr: str = "" + exit_code: Optional[int] = None + return_value: Optional[str] = None # for function-based testing + + # Timing (all in milliseconds) + compile_duration_ms: float = 0.0 + run_duration_ms: float = 0.0 + total_duration_ms: float = 0.0 + + # Resource usage (optional, depends on sandbox implementation) + peak_memory_bytes: Optional[int] = None + cpu_time_ms: Optional[float] = None + + # Cache info + cache_hit: bool = False + cache_key: Optional[str] = None + + @property + def compiled(self) -> bool: + """True if code compiled successfully.""" + return self.compile_result.success + + @property + def succeeded(self) -> bool: + """True if code compiled and ran without errors.""" + return self.compiled and self.run_status == RunStatus.SUCCESS + + @property + def timed_out(self) -> bool: + """True if either compilation or execution timed out.""" + return ( + self.compile_result.status == CompileStatus.TIMEOUT + or self.run_status == RunStatus.TIMEOUT + ) + + +@dataclass +class TestCase: + """ + A single test case. 
+ + The interpretation of `input` and `expected` depends on the CodeRunner: + - stdin/stdout: input is stdin string, expected is stdout string + - function call: input is (args, kwargs), expected is return value + - pytest: input is test code, expected is None (pass/fail from exit code) + """ + + __test__ = False # Prevent pytest from collecting this as a test class + + input: Any + expected: Any + id: str = "" + weight: float = 1.0 # for weighted partial credit + metadata: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class BatchExecutionSpec: + """ + Specification for executing multiple tests in a single batch. + + Used by execute_batch() to run all tests with minimal semaphore acquisitions. + The batch runner receives this as a manifest and executes tests sequentially + inside the container, streaming results back as JSONL. + """ + + code: str + tests: List[TestCase] + compile_first: bool = True + timeout_s: float = 5.0 + stop_on_first_failure: bool = True + + +@dataclass +class TestResult: + """Result of running a single test case.""" + + __test__ = False # Prevent pytest from collecting this as a test class + + test_case: TestCase + passed: bool + actual: Any + execution: ExecutionResult + comparison_details: Optional[str] = None # explains why comparison failed + + @property + def compiled(self) -> bool: + """True if code compiled for this test.""" + return self.execution.compiled + + @property + def ran(self) -> bool: + """True if code actually executed (not skipped).""" + return self.execution.run_status not in (None, RunStatus.NOT_RUN) + + +@dataclass +class BatchTestResult: + """ + Result of running all tests for a code submission. + + Aggregates individual TestResults and provides convenience properties + for computing rewards and analyzing results. 
+ """ + + results: List[TestResult] + code_hash: str + tests_hash: str + + @property + def passed_count(self) -> int: + """Number of tests that passed.""" + return sum(1 for r in self.results if r.passed) + + @property + def total_count(self) -> int: + """Total number of tests.""" + return len(self.results) + + @property + def all_passed(self) -> bool: + """True if all tests passed.""" + return self.passed_count == self.total_count and self.total_count > 0 + + @property + def pass_rate(self) -> float: + """Fraction of tests that passed (0.0 to 1.0).""" + if self.total_count == 0: + return 0.0 + return self.passed_count / self.total_count + + @property + def first_failure(self) -> Optional[TestResult]: + """The first test that failed, or None if all passed.""" + for r in self.results: + if not r.passed: + return r + return None + + @property + def compile_failed(self) -> bool: + """True if code failed to compile (before any tests ran).""" + if not self.results: + return False + # If compilation failed, all tests will have the same compile failure + return not self.results[0].compiled + + @property + def total_execution_ms(self) -> float: + """Total execution time across all tests.""" + return sum(r.execution.total_duration_ms for r in self.results) + + @property + def total_compile_ms(self) -> float: + """Total compilation time (usually same across tests if compiled once).""" + if not self.results: + return 0.0 + # Compilation typically happens once, take max to be safe + return max(r.execution.compile_duration_ms for r in self.results) + + @property + def total_run_ms(self) -> float: + """Total runtime across all tests (excluding compilation).""" + return sum(r.execution.run_duration_ms for r in self.results) + + def get_failures(self) -> List[TestResult]: + """All tests that failed.""" + return [r for r in self.results if not r.passed] + + def get_successes(self) -> List[TestResult]: + """All tests that passed.""" + return [r for r in self.results if r.passed] 
diff --git a/src/ludic/eval/cli.py b/src/ludic/eval/cli.py index 1deaee2..2fa9061 100644 --- a/src/ludic/eval/cli.py +++ b/src/ludic/eval/cli.py @@ -22,7 +22,7 @@ SamplingParams, ReturnSpec, ) -from ludic.interaction import SingleAgentSyncProtocol +from ludic.interaction import SingleAgentProtocol from ludic.parsers import ParseResult from ludic.training.batching.rollout_engine import RolloutEngine @@ -88,14 +88,14 @@ def build_single_agent_engine( ) -> RolloutEngine: make_ctx = context_factory or (lambda sp: FullDialog(system_prompt=sp)) - def protocol_factory() -> SingleAgentSyncProtocol: + def protocol_factory() -> SingleAgentProtocol: agent = Agent( client=client, model=model, ctx=make_ctx(system_prompt), parser=parser, ) - return SingleAgentSyncProtocol( + return SingleAgentProtocol( agent=agent, stop_on_parse_error=stop_on_parse_error, ) diff --git a/src/ludic/inference/vllm_server.py b/src/ludic/inference/vllm_server.py index a985a21..1f9b5f4 100644 --- a/src/ludic/inference/vllm_server.py +++ b/src/ludic/inference/vllm_server.py @@ -108,9 +108,9 @@ def init_communicator(self, host: str, port: int, world_size: int) -> None: # --- DEBUG: Print internal vLLM parameter names --- # This executes on the worker process. We use Rank 0 to avoid duplicates. 
if self.pynccl_comm.rank == 0: - print("\n" + "="*60) + print("\n" + "=" * 60) print("🔍 [DEBUG] vLLM Internal Parameter Names (Worker Rank 0)") - print("="*60) + print("=" * 60) try: # Access the underlying torch model model_instance = self.model_runner.model @@ -121,7 +121,7 @@ def init_communicator(self, host: str, port: int, world_size: int) -> None: print(f"Total parameters found: {count}") except Exception as e: print(f"⚠️ Could not print parameter names: {e}") - print("="*60 + "\n") + print("=" * 60 + "\n") # -------------------------------------------------- def update_named_param(self, name: str, dtype: str, shape: Sequence[int]) -> None: @@ -230,7 +230,7 @@ def update_state(self, batch_update: Optional[BatchUpdate]) -> None: self.req_state.pop(ridx, None) # 2) Handle additions - for (req_idx, params, prompt_ids, output_ids) in batch_update.added: + for req_idx, params, prompt_ids, output_ids in batch_update.added: assert isinstance(params, SamplingParams) extra_args = getattr(params, "extra_args", None) @@ -248,7 +248,7 @@ def update_state(self, batch_update: Optional[BatchUpdate]) -> None: } # 3) Handle moves - for (src, dst, direction) in batch_update.moved: + for src, dst, direction in batch_update.moved: if direction == MoveDirectionality.UNIDIRECTIONAL: state = self.req_state.pop(src, None) if state is not None: @@ -371,9 +371,7 @@ async def health() -> dict[str, str]: @app.get("/get_world_size") async def get_world_size() -> dict[str, int]: - return { - "world_size": args.tensor_parallel_size * args.data_parallel_size - } + return {"world_size": args.tensor_parallel_size * args.data_parallel_size} @app.get("/runtime_version") async def runtime_version() -> dict[str, int]: @@ -390,9 +388,7 @@ async def init_communicator(request: Request) -> dict[str, str]: world_size = data.get("world_size") create_background_task( - engine.collective_rpc( - "init_communicator", args=(host, port, world_size) - ) + engine.collective_rpc("init_communicator", args=(host, 
port, world_size)) ) return {"status": "ok"} @@ -433,18 +429,18 @@ async def update_param_batch(request: Request) -> dict[str, str]: """ data = await request.json() metadata = data.get("metadata", []) # List of {name, dtype, shape} - + # --- DEBUG: Verify what the server received --- - print("\n" + "="*80) + print("\n" + "=" * 80) print(f"📥 [SERVER DEBUG] Received Batch Metadata (Total: {len(metadata)})") - print("="*80) + print("=" * 80) for i, m in enumerate(metadata): # Print only first 10 to avoid spamming logs, or all if short if i < 10: print(f" • {m.get('name')} | {m.get('shape')}") if len(metadata) > 10: - print(f" ... (+{len(metadata)-10} more)") - print("="*80 + "\n") + print(f" ... (+{len(metadata) - 10} more)") + print("=" * 80 + "\n") # ---------------------------------------------- # Check if an explicit version was provided by the Trainer @@ -462,7 +458,7 @@ async def do_update_batch() -> None: # Reset cache and bump version after full batch await engine.reset_prefix_cache() - + global RUNTIME_VERSION async with RUNTIME_VERSION_LOCK: if forced_version is not None: @@ -499,7 +495,7 @@ async def do_update() -> None: await engine.collective_rpc( "update_named_param", args=(name, dtype, shape) ) - + global RUNTIME_VERSION async with RUNTIME_VERSION_LOCK: if requested_version is not None: @@ -578,7 +574,9 @@ def main() -> None: # vLLM can silently override sampling params using the model's Hugging Face # `generation_config` unless `--generation-config vllm` is set. Defaulting # to `vllm` makes Ludic's SamplingParams the source of truth. 
- if not any(a == "--generation-config" or a.startswith("--generation-config=") for a in argv): + if not any( + a == "--generation-config" or a.startswith("--generation-config=") for a in argv + ): argv = [*argv, "--generation-config", "vllm"] args = parser.parse_args(argv) assert args is not None diff --git a/src/ludic/interaction/__init__.py b/src/ludic/interaction/__init__.py index a9f1cf4..a826ea6 100644 --- a/src/ludic/interaction/__init__.py +++ b/src/ludic/interaction/__init__.py @@ -3,12 +3,12 @@ from .base import InteractionProtocol from .info import merge_step_info from .multi_agent import MultiAgentProtocol -from .single_agent import SingleAgentSyncProtocol +from .single_agent import SingleAgentProtocol from .step_collector import TraceCollector __all__ = [ "InteractionProtocol", - "SingleAgentSyncProtocol", + "SingleAgentProtocol", "MultiAgentProtocol", "TraceCollector", "merge_step_info", diff --git a/src/ludic/interaction/multi_agent.py b/src/ludic/interaction/multi_agent.py index 3992b9c..680f205 100644 --- a/src/ludic/interaction/multi_agent.py +++ b/src/ludic/interaction/multi_agent.py @@ -30,6 +30,12 @@ class MultiAgentProtocol(InteractionProtocol): logged for the failing agent (reward=parse_result.reward, info includes parse_error=True). The failing agent's context is updated with the synthetic observation for the next turn. + + Async environment support: + This protocol does NOT currently support async environments. + It uses the synchronous env.reset() and env.step() methods. + For async multi-agent environments, this protocol would need + async detection similar to SingleAgentProtocol. 
""" def __init__(self, agents: Dict[str, Agent]): diff --git a/src/ludic/interaction/single_agent.py b/src/ludic/interaction/single_agent.py index d7fe6cc..6cd3f98 100644 --- a/src/ludic/interaction/single_agent.py +++ b/src/ludic/interaction/single_agent.py @@ -1,4 +1,6 @@ from __future__ import annotations + +import inspect from typing import Optional, List from ludic.envs.env import LudicEnv @@ -8,15 +10,48 @@ from .base import InteractionProtocol from .info import merge_step_info -class SingleAgentSyncProtocol(InteractionProtocol): + +def _has_async_env_methods(env: LudicEnv) -> tuple[bool, bool]: """ - Implements the standard single-agent, synchronous interaction loop. - + Detect if environment has async env_reset/env_step methods. + + WARNING: If this returns (True, True), you MUST use the async methods + directly (env.env_reset(), env.env_step()) rather than the sync wrappers + (env.reset(), env.step()). Calling sync wrappers on an async env will + return coroutine objects instead of results. + + This is used to support envs like CodeExecEnv that have async methods + while maintaining backward compatibility with sync envs. + + Returns: + Tuple of (has_async_reset, has_async_step) + """ + has_async_reset = ( + hasattr(env, "env_reset") + and inspect.iscoroutinefunction(env.env_reset) + ) + has_async_step = ( + hasattr(env, "env_step") + and inspect.iscoroutinefunction(env.env_step) + ) + return has_async_reset, has_async_step + +class SingleAgentProtocol(InteractionProtocol): + """ + Implements the standard single-agent interaction loop. + This protocol consumes a LudicEnv but ASSUMES it has exactly one agent and that this agent is active every step. - + It works perfectly with any env inheriting from SingleAgentEnv. + Async env support: + This protocol automatically detects envs with async `env_reset` and + `env_step` methods (e.g., CodeExecEnv). 
For such envs, the protocol + calls these methods directly and awaits them, bypassing the sync + wrappers in SingleAgentEnv. This provides full backward compatibility + with sync envs while supporting async envs transparently. + Parser failures: If the agent's parser returns ParseResult.action=None, the protocol does not call env.step(). Instead it logs a synthetic Step with @@ -69,15 +104,22 @@ async def run( agent_ids = env.agent_ids if len(agent_ids) != 1: raise ValueError( - f"SingleAgentSyncProtocol requires a LudicEnv with " + f"SingleAgentProtocol requires a LudicEnv with " f"exactly one agent, but found {len(agent_ids)}." ) agent_id = agent_ids[0] + # Check for async env methods (e.g., CodeExecEnv) + has_async_reset, has_async_step = _has_async_env_methods(env) + # 2. --- Reset Env --- - # env.reset() returns a dict - obs_info_dict = env.reset(seed=env_seed) - obs, info = obs_info_dict[agent_id] + # For async envs, call env_reset directly and await it. + # For sync envs, use the standard reset() wrapper. + if has_async_reset: + obs, info = await env.env_reset(seed=env_seed) # type: ignore[union-attr] + else: + obs_info_dict = env.reset(seed=env_seed) + obs, info = obs_info_dict[agent_id] # 3. --- Reset Agent & Feed First Obs --- # Choose system prompt: prefer the context's default if set, else env suggestion. @@ -149,12 +191,14 @@ async def run( parsed_action = parse_result.action parser_reward = parse_result.reward - # Send action to env in the required dict format - actions_dict = {agent_id: parsed_action} - outcomes_dict = env.step(actions_dict) - - # Unwrap the outcome for our agent - env_outcome = outcomes_dict[agent_id] + # For async envs, call env_step directly and await it. + # For sync envs, use the standard step() wrapper. 
+ if has_async_step: + env_outcome = await env.env_step(parsed_action) # type: ignore[union-attr] + else: + actions_dict = {agent_id: parsed_action} + outcomes_dict = env.step(actions_dict) + env_outcome = outcomes_dict[agent_id] # Combine parser and env rewards total_reward = env_outcome.reward + parser_reward diff --git a/src/ludic/training/algorithm.py b/src/ludic/training/algorithm.py index 720d447..90534c2 100644 --- a/src/ludic/training/algorithm.py +++ b/src/ludic/training/algorithm.py @@ -1,8 +1,13 @@ from __future__ import annotations +import logging +import statistics +from collections import defaultdict from dataclasses import dataclass from typing import Any, Dict, Mapping, Optional, Protocol +log = logging.getLogger(__name__) + from jaxtyping import Float from torch import nn, Tensor @@ -14,14 +19,23 @@ ClippedSurrogateLoss, TokenClippedSurrogateLoss, CISPOLoss, + TokenKLLoss, + CompositeLoss, + LossTerm, MaskedCausalLMCrossEntropyLoss, ) -from ludic.training.credit_assignment import MonteCarloReturn, GroupNormalizedReturn, ConstantCredit +from ludic.training.credit_assignment import ( + MonteCarloReturn, + GroupNormalizedReturn, + HybridNormalizedReturn, + ConstantCredit, +) Batch = Mapping[str, Tensor] Logits = Float[Tensor, "B T V"] + class PreprocessFn(Protocol): def __call__(self, saw_batch: SAWBatch) -> SAWBatch: ... @@ -47,9 +61,19 @@ def compute_loss( self, model: nn.Module, batch: Batch, + *, + cast_logits_to_fp32: bool = False, ) -> tuple[Tensor, Dict[str, Any]]: """ Runs the forward pass once and delegates to the Loss object. + + Args: + model: The trainable model. + batch: Collated batch tensors (input_ids, attention_mask, etc.). + cast_logits_to_fp32: If True, cast logits to FP32 before loss computation. + This improves importance sampling ratio stability for ratio-based + objectives (GRPO, CISPO, etc.) by reducing precision errors in + exp(log_ratio). Recommended by ScaleRL paper (arXiv:2510.13786). 
""" # --- Run the forward pass --- input_ids = batch["input_ids"] @@ -60,6 +84,10 @@ def compute_loss( ) logits: Logits = outputs.logits + # ScaleRL: FP32 logits prevent IS ratio precision issues in exp(logp_new - logp_old) + if cast_logits_to_fp32: + logits = logits.float() + # Pass the resulting logits to the loss function return self.loss.compute(logits, batch) @@ -128,11 +156,102 @@ def drop_zero_weight_samples( return saw_batch +def filter_zero_variance_groups( + saw_batch: SAWBatch, + *, + eps: float = 1e-6, +) -> SAWBatch: + """ + Filter out groups where all samples have identical rewards (zero variance). + + These groups contribute zero gradient in GRPO-style training because + A_i = R_i - mean(R_group) = 0 for all members when all R_i are equal. + + Unlike drop_zero_weight_samples (which filters by computed advantage weight), + this filters entire groups by reward variance before training. This is the + DAPO-style "zero-variance prompt" (ZVP) filtering. + + Args: + saw_batch: Batch of SAWItems to filter. + eps: Threshold for zero-variance detection. Groups with std <= eps are dropped. + + Returns: + Filtered SAWBatch with zero-variance groups removed. + Sets saw_batch.meta["zvp_filter_frac"] to the fraction of groups dropped. 
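A toy illustration of the filtering rule above (group rewards invented): groups whose members all received the same reward are dropped, and `zvp_filter_frac` records the drop rate.

```python
import statistics

# Three prompt groups of 4 rollouts each; rewards are illustrative only.
rewards_by_group = {
    "p0": [1.0, 1.0, 1.0, 1.0],  # all correct   -> std == 0, dropped
    "p1": [0.0, 1.0, 0.0, 1.0],  # mixed outcome -> kept
    "p2": [0.0, 0.0, 0.0, 0.0],  # all incorrect -> std == 0, dropped
}
eps = 1e-6
kept = {g: r for g, r in rewards_by_group.items() if statistics.stdev(r) > eps}
zvp_filter_frac = 1 - len(kept) / len(rewards_by_group)
print(sorted(kept), zvp_filter_frac)
```

Here two of three groups would have contributed zero GRPO gradient while still consuming forward/backward compute.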
+ """ + if eps < 0: + raise ValueError("eps must be >= 0.") + + items = saw_batch.items + if not items: + return saw_batch + + # Group items by group_id + groups: dict[str, list] = defaultdict(list) + for item in items: + group_id = item.meta.get("request_meta", {}).get("group_id") + if group_id is None: + # Items without group_id pass through (not part of GRPO-style grouping) + groups["__ungrouped__"].append(item) + else: + groups[group_id].append(item) + + kept_items = [] + total_groups = 0 + dropped_groups = 0 + + for group_id, group_items in groups.items(): + # Ungrouped items always pass through + if group_id == "__ungrouped__": + kept_items.extend(group_items) + continue + + total_groups += 1 + weights = [float(it.weight) for it in group_items] + + # Compute std (need at least 2 items for meaningful variance) + if len(weights) < 2: + kept_items.extend(group_items) + continue + + group_std = statistics.stdev(weights) if len(weights) > 1 else 0.0 + + if group_std <= eps: + # Zero-variance group: drop it + dropped_groups += 1 + log.debug( + "filter_zero_variance_groups: dropping group_id=%r (std=%.2e <= eps=%.2e)", + group_id, + group_std, + eps, + ) + else: + kept_items.extend(group_items) + + # Track filter rate in batch metadata + if total_groups > 0: + zvp_frac = dropped_groups / total_groups + if saw_batch.meta is None: + saw_batch.meta = {} + saw_batch.meta["zvp_filter_frac"] = zvp_frac + if dropped_groups > 0: + log.debug( + "filter_zero_variance_groups: dropped %d/%d groups (%.1f%%)", + dropped_groups, + total_groups, + zvp_frac * 100, + ) + + saw_batch.items = kept_items + return saw_batch + + def compose_preprocess(*fns: PreprocessFn) -> PreprocessFn: def _composed(batch: SAWBatch) -> SAWBatch: for fn in fns: batch = fn(batch) return batch + return _composed @@ -166,7 +285,9 @@ def make_reinforce( preprocess = None if drop_zero_weight: - preprocess = lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps) + preprocess = lambda batch: 
drop_zero_weight_samples( + batch, eps=drop_zero_weight_eps + ) return RLAlgorithm( name=name, @@ -208,7 +329,9 @@ def make_reinforce_baseline( preprocess = None if drop_zero_weight: - preprocess = lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps) + preprocess = lambda batch: drop_zero_weight_samples( + batch, eps=drop_zero_weight_eps + ) return RLAlgorithm( name=name, @@ -270,7 +393,9 @@ def make_grpo( ) preprocess_fns = [] if drop_zero_weight: - preprocess_fns.append(lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps)) + preprocess_fns.append( + lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps) + ) preprocess_fns.append(validate_actor_logps) preprocess = compose_preprocess(*preprocess_fns) @@ -317,7 +442,9 @@ def make_gspo( ) preprocess_fns = [] if drop_zero_weight: - preprocess_fns.append(lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps)) + preprocess_fns.append( + lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps) + ) preprocess_fns.append(validate_actor_logps) preprocess = compose_preprocess(*preprocess_fns) @@ -337,6 +464,7 @@ def make_cispo( clip_eps_low: float = 1e6, clip_eps_high: float = 0.2, length_normalize: bool = True, + kl_coeff: float = 0.0, drop_zero_weight: bool = False, drop_zero_weight_eps: float = 1e-4, name: str = "cispo", @@ -358,6 +486,9 @@ def make_cispo( Loss: L = - E[ sg(clip(r_t, 1-ε_low, 1+ε_high)) * A * log π(a_t|s_t) ] + When kl_coeff > 0, adds a token-level KL penalty for stability: + L_total = L_cispo + kl_coeff * KL(π_new || π_actor) + Args: group_size: Number of rollouts per group for advantage normalization. group_normalize_adv: Whether to normalize advantages within each group. @@ -366,6 +497,9 @@ def make_cispo( value (effectively no lower bound). clip_eps_high: Upper bound for IS weight clipping. length_normalize: Whether to normalize by number of action tokens. + kl_coeff: Coefficient for token-level KL penalty. 
Set > 0 for + additional stability (penalizes divergence from behavior policy). + Typical values: 0.01-0.1. Default 0.0 (no KL penalty). drop_zero_weight: Whether to drop zero-advantage samples. drop_zero_weight_eps: Epsilon for zero-weight detection. name: Algorithm name for logging. @@ -381,14 +515,31 @@ def make_cispo( normalize_adv=group_normalize_adv, positive_only=positive_only, ) - loss: Loss = CISPOLoss( + + cispo_loss: Loss = CISPOLoss( clip_eps_low=clip_eps_low, clip_eps_high=clip_eps_high, length_normalize=length_normalize, ) + + # Optionally add token-level KL penalty for stability + # CompositeLoss now shares token_logp via SharedContext (memory-efficient) + if kl_coeff > 0: + kl_loss = TokenKLLoss(coeff=kl_coeff, length_normalize=length_normalize) + loss: Loss = CompositeLoss( + terms=[ + LossTerm(name="cispo", loss=cispo_loss, weight=1.0), + LossTerm(name="kl", loss=kl_loss, weight=1.0), + ] + ) + else: + loss = cispo_loss + preprocess_fns = [] if drop_zero_weight: - preprocess_fns.append(lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps)) + preprocess_fns.append( + lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps) + ) preprocess_fns.append(validate_actor_logps) preprocess = compose_preprocess(*preprocess_fns) @@ -450,3 +601,119 @@ def make_sft( credit_assigner=credit_assigner, loss=loss, ) + + +# --------------------------------------------------------------------------- +# ScaleRL (CISPO + Hybrid Normalization + ZVP Filtering) +# --------------------------------------------------------------------------- + + +def make_scalerl( + *, + group_size: int, + positive_only: bool = False, + clip_eps_low: float = 0.20, + clip_eps_high: float = 0.28, + length_normalize: bool = True, + kl_coeff: float = 0.0, + filter_zvp: bool = True, + zvp_filter_eps: float = 1e-6, + drop_zero_weight: bool = False, + drop_zero_weight_eps: float = 1e-4, + name: str = "scalerl", +) -> RLAlgorithm: + """ + ScaleRL recipe: CISPO loss + 
hybrid advantage normalization + ZVP filtering. + + This combines the key sample-efficiency improvements from the ScaleRL paper: + + 1. **HybridNormalizedReturn**: Group-mean centering + batch-std scaling. + More robust than pure group-level normalization because it avoids + std=0 explosions in low-variance groups (easy prompts). + + 2. **CISPOLoss**: Truncated IS-weight policy gradient that preserves + gradient contributions from rare tokens (crucial for reflective + reasoning behaviors like "Wait", "However", "Recheck"). + + 3. **Zero-variance group filtering**: Drops prompts where all generations + received the same reward (all correct OR all incorrect). These contribute + zero gradient but consume compute. + + 4. **FP32 logits** (via TrainerConfig.cast_logits_to_fp32=True): + Recommended for IS ratio stability. Not controlled by this preset— + set in TrainerConfig (default True). + + Args: + group_size: Number of rollouts per group (required for credit assignment). + positive_only: If True, clip negative advantages to zero (REINFORCE-only). + clip_eps_low: Lower CISPO clipping bound. Default 0.20 per context-notes.md. + clip_eps_high: Upper CISPO clipping bound. Default 0.28 per context-notes.md. + length_normalize: Whether to normalize by number of action tokens. + kl_coeff: Coefficient for optional token-level KL penalty. + Set > 0 for additional stability. Typical: 0.01-0.1. Default 0.0. + filter_zvp: If True, filter zero-variance groups before training. + zvp_filter_eps: Epsilon for ZVP detection (groups with std <= eps are dropped). + drop_zero_weight: If True, additionally drop individual samples with ~zero weight. + drop_zero_weight_eps: Epsilon for zero-weight sample detection. + name: Algorithm name for logging/metrics. + + Note: Rollouts must carry `group_id` in their metadata and each group + must have exactly `group_size` members. Use GRPORequestStrategy for + request expansion. 
+ + References: + - ScaleRL: arXiv:2510.13786 + - DAPO (ZVP filtering): arXiv:2503.14476 + - MiniMax-M1 (CISPO): arXiv:2506.13585 + """ + # HybridNormalizedReturn: group-mean baseline + batch-std scaling + credit_assigner: CreditAssigner = HybridNormalizedReturn( + group_size=group_size, + positive_only=positive_only, + ) + + # CISPO loss with asymmetric clipping + cispo_loss: Loss = CISPOLoss( + clip_eps_low=clip_eps_low, + clip_eps_high=clip_eps_high, + length_normalize=length_normalize, + ) + + # Optionally add token-level KL penalty for stability + if kl_coeff > 0: + kl_loss = TokenKLLoss(coeff=kl_coeff, length_normalize=length_normalize) + loss: Loss = CompositeLoss( + terms=[ + LossTerm(name="cispo", loss=cispo_loss, weight=1.0), + LossTerm(name="kl", loss=kl_loss, weight=1.0), + ] + ) + else: + loss = cispo_loss + + # Build preprocessing pipeline (order matters) + preprocess_fns = [] + + # 1. Zero-variance group filter (DAPO-style) + if filter_zvp: + preprocess_fns.append( + lambda batch: filter_zero_variance_groups(batch, eps=zvp_filter_eps) + ) + + # 2. Drop individual zero-weight samples (optional, after credit assignment) + if drop_zero_weight: + preprocess_fns.append( + lambda batch: drop_zero_weight_samples(batch, eps=drop_zero_weight_eps) + ) + + # 3. 
Validate actor logprobs (required for CISPO ratio computation) + preprocess_fns.append(validate_actor_logps) + + preprocess = compose_preprocess(*preprocess_fns) + + return RLAlgorithm( + name=name, + credit_assigner=credit_assigner, + loss=loss, + preprocess=preprocess, + ) diff --git a/src/ludic/training/batching/synced_batching.py b/src/ludic/training/batching/synced_batching.py index f4e5377..e55e285 100644 --- a/src/ludic/training/batching/synced_batching.py +++ b/src/ludic/training/batching/synced_batching.py @@ -1,4 +1,6 @@ from __future__ import annotations +import logging +import time from typing import Callable, List, Optional from ludic.training.types import ( @@ -10,6 +12,8 @@ ) from .rollout_engine import RolloutEngine +logger = logging.getLogger(__name__) + class RolloutBatchSource(BatchSource): """ @@ -53,7 +57,13 @@ async def next_batch(self) -> SAWBatch: Pull requests -> Generate (blocking) -> Return Batch. """ requests = self._requests_fn() - return await self._engine.generate_batch( + n_requests = len(requests) + logger.info( + f"Generating batch: {n_requests} rollouts with concurrency={self._concurrency}" + ) + start_time = time.monotonic() + + batch = await self._engine.generate_batch( requests=requests, max_steps=self._max_steps, credit_assigner=self._credit_assigner, @@ -61,3 +71,10 @@ async def next_batch(self) -> SAWBatch: concurrency=self._concurrency, sample_filter=self._sample_filter, ) + + elapsed = time.monotonic() - start_time + logger.info( + f"Batch complete: {len(batch.items)} samples from {n_requests} rollouts " + f"in {elapsed:.1f}s ({n_requests / elapsed:.1f} rollouts/s)" + ) + return batch diff --git a/src/ludic/training/config.py b/src/ludic/training/config.py index a2a82fb..a1e1253 100644 --- a/src/ludic/training/config.py +++ b/src/ludic/training/config.py @@ -43,9 +43,16 @@ class TrainerConfig: syncing (e.g., pure offline/local training). - mixed_precision_dtype: - Optional string to configure FSDP's mixed precision policy. 
+ Optional string to configure FSDP's mixed precision policy. Use "bf16" or "fp16". If None, defaults to full precision (fp32). + - cast_logits_to_fp32: + If True, cast model logits to FP32 before loss computation. + Critical for importance sampling stability in ratio-based RL objectives + (GRPO, CISPO, etc.) where BF16 precision errors compound in exp(log_ratio). + Follows ScaleRL paper's "FP32 at LM head" recommendation. + See: arXiv:2510.13786 (ScaleRL) + ========================== Collation ========================== @@ -106,6 +113,7 @@ class TrainerConfig: micro_token_budget: int = 8192 sync_every_steps: int = 1 mixed_precision_dtype: Optional[str] = "bf16" + cast_logits_to_fp32: bool = True # ScaleRL: FP32 logits for IS ratio stability # PipelineRL specific settings max_lag: Optional[int] = None # Drop batches older than N steps diff --git a/src/ludic/training/credit_assignment.py b/src/ludic/training/credit_assignment.py index c17c351..9418618 100644 --- a/src/ludic/training/credit_assignment.py +++ b/src/ludic/training/credit_assignment.py @@ -97,6 +97,113 @@ def compute( return out +@dataclass +class HybridNormalizedReturn: + """ + ScaleRL-style advantage normalization: group-mean baseline, batch-std scaling. + + Formula: A_i = (R_i - mean(R_group)) / (std(A_batch) + eps) + + This is more robust than pure group-level normalization (GroupNormalizedReturn) + because: + 1. Avoids std=0 explosions in low-variance groups (easy prompts) + 2. Provides consistent advantage scale across diverse prompts + 3. Recommended by ScaleRL and "Tricks or Traps Part I" papers + + The key insight: use group-level *centering* (baseline = group mean) but + batch-level *scaling* (divide by batch std). This combines GRPO's per-prompt + baseline with robust global scaling. + + Contract: + - Rollouts must have `group_id` in `rollout.meta["request_meta"]["group_id"]`. + - Each group must have exactly `group_size` rollouts. + - Raises ValueError if either condition is violated. 
+ + Args: + group_size: Number of rollouts per group. + eps: Small constant for numerical stability in std division. + positive_only: If True, clip negative advantages to 0. + + Reference: ScaleRL (arXiv:2510.13786), Tricks or Traps Part I (arXiv:2508.08221) + """ + + group_size: int + eps: float = 1e-8 + positive_only: bool = False + + def __post_init__(self): + if self.group_size <= 0: + raise ValueError(f"group_size must be positive, got {self.group_size}") + + def compute( + self, + rollouts: List[Rollout], + ) -> Dict[RolloutStepKey, float]: + + out: Dict[RolloutStepKey, float] = {} + + # Group by group_id from request meta + groups: Dict[str, List[Rollout]] = defaultdict(list) + for r in rollouts: + group_id = r.meta.get("request_meta", {}).get("group_id") + if group_id is None: + raise ValueError( + f"Rollout {r.id} missing group_id in meta['request_meta']. " + "HybridNormalizedReturn requires each rollout to have a group_id." + ) + groups[group_id].append(r) + + # Phase 1: Compute group-centered advantages (A_i = R_i - mean(R_group)) + # Store (rollout, advantage) pairs for batch-level normalization + all_advantages: List[float] = [] + rollout_advantages: List[tuple[Rollout, float]] = [] + + for group_id, group_rollouts in groups.items(): + # Validate group size + actual_size = len(group_rollouts) + if actual_size != self.group_size: + raise ValueError( + f"Group size mismatch for group_id={group_id}: " + f"expected {self.group_size}, got {actual_size}." 
+ ) + + # Get total reward for each rollout in the group + rewards = torch.tensor( + [r.total_reward for r in group_rollouts], + dtype=torch.float32, + ) + + # Group-level centering: A_i = R_i - mean(R_group) + baseline = rewards.mean() + advantages = rewards - baseline + + for i, r in enumerate(group_rollouts): + adv = advantages[i].item() + all_advantages.append(adv) + rollout_advantages.append((r, adv)) + + # Phase 2: Batch-level std normalization + if len(all_advantages) == 0: + return out + + all_adv_tensor = torch.tensor(all_advantages, dtype=torch.float32) + batch_std = all_adv_tensor.std(unbiased=False) + + # Normalize all advantages by batch std + for rollout, raw_adv in rollout_advantages: + adv = raw_adv / (batch_std.item() + self.eps) + + if self.positive_only: + adv = max(adv, 0.0) + + # Assign same advantage to all steps in the rollout + for step in rollout.steps: + key: RolloutStepKey = (rollout.id, step.index) + out[key] = adv + + return out + + @dataclass class MonteCarloReturn: """ diff --git a/src/ludic/training/hardware.py b/src/ludic/training/hardware.py new file mode 100644 index 0000000..e1cad63 --- /dev/null +++ b/src/ludic/training/hardware.py @@ -0,0 +1,251 @@ +""" +GPU hardware detection and Flash Attention configuration utilities. + +This module provides utilities for: +- Detecting GPU architecture (Hopper, Ampere, etc.) 
+- Selecting optimal attention implementation based on hardware +- Configuring PyTorch SDPA backends for Flash Attention + +Usage: + from ludic.training.hardware import configure_flash_attention + + # In training script, after device detection: + attn_impl = configure_flash_attention(device="cuda", disable_flash_attn=False) + model = AutoModelForCausalLM.from_pretrained(..., attn_implementation=attn_impl) +""" + +from __future__ import annotations + +import logging +from typing import Literal, Optional + +import torch + +logger = logging.getLogger(__name__) + +# GPU architecture compute capability mapping +# See: https://developer.nvidia.com/cuda-gpus +GPU_ARCHITECTURES = { + (9, 0): "hopper", # H100, H200, GH200 + (8, 9): "ada", # RTX 4090, L40 + (8, 6): "ampere", # RTX 3090, A10 + (8, 0): "ampere", # A100 + (7, 5): "turing", # RTX 2080, T4 + (7, 0): "volta", # V100 +} + +AttentionImpl = Literal["flash_attention_3", "flash_attention_2", "sdpa", "eager"] + + +def detect_gpu_architecture() -> Optional[str]: + """ + Detect the GPU architecture from CUDA compute capability. + + Returns: + Architecture name: "hopper", "ampere", "ada", "turing", "volta", or None + if no CUDA GPU is available. + """ + if not torch.cuda.is_available(): + return None + + try: + capability = torch.cuda.get_device_capability() + arch = GPU_ARCHITECTURES.get(capability) + if arch is None: + # Unknown architecture, try to infer from major version + major = capability[0] + if major >= 9: + arch = "hopper" + elif major >= 8: + arch = "ampere" + else: + arch = "older" + return arch + except Exception as e: + logger.warning(f"Failed to detect GPU architecture: {e}") + return None + + +def get_cuda_version() -> Optional[tuple[int, int]]: + """ + Get the CUDA runtime version. + + Returns: + Tuple of (major, minor) version, or None if CUDA unavailable. 
+ """ + if not torch.cuda.is_available(): + return None + + try: + version = torch.version.cuda + if version is None: + return None + parts = version.split(".") + return (int(parts[0]), int(parts[1])) + except Exception as e: + logger.warning(f"Failed to get CUDA version: {e}") + return None + + +def _check_flash_attn_3_available() -> bool: + """ + Check if Flash Attention 3 is available for HuggingFace Transformers. + + HuggingFace Transformers checks for flash_attention_3 support via: + importlib.util.find_spec("flash_attn_3") + + This requires either: + 1. The flash_attn_3 package installed (pip install flash_attn_3) + 2. Building flash-attn from the hopper/ subdirectory + 3. Using HuggingFace 'kernels' package (pip install kernels) + + Returns True only if HuggingFace will accept flash_attention_3. + """ + import importlib.util + + # Check what HuggingFace Transformers actually checks + if importlib.util.find_spec("flash_attn_3") is not None: + logger.info("flash_attn_3 package found - FA3 available") + return True + + # Also check for flash_attn_interface (alternative FA3 installation) + if importlib.util.find_spec("flash_attn_interface") is not None: + logger.info("flash_attn_interface found - FA3 may be available") + # Note: This might not work with all HF Transformers versions + # as they specifically check for flash_attn_3, not flash_attn_interface + return False # Be conservative - HF checks for flash_attn_3 specifically + + logger.debug("FA3 not available (flash_attn_3 package not found)") + return False + + +def get_optimal_attention_impl( + *, + disable_flash_attn: bool = False, +) -> AttentionImpl: + """ + Determine the optimal attention implementation for the current hardware. 
+ + Selection logic: + - Hopper (H100/H200) + CUDA >= 12.3 + flash-attn >= 2.7: flash_attention_3 + - Ampere/Ada + CUDA >= 11.6 + flash-attn installed: flash_attention_2 + - Otherwise: sdpa (PyTorch native, still uses flash kernels when possible) + + Args: + disable_flash_attn: If True, skip flash attention and use SDPA. + + Returns: + Attention implementation string for HuggingFace models: + "flash_attention_3", "flash_attention_2", "sdpa", or "eager" + """ + if disable_flash_attn: + logger.info("Flash Attention disabled by user request, using SDPA") + return "sdpa" + + arch = detect_gpu_architecture() + cuda_version = get_cuda_version() + + # Check if flash_attn is available + try: + import flash_attn + flash_attn_available = True + flash_attn_version = getattr(flash_attn, "__version__", "unknown") + except ImportError: + flash_attn_available = False + flash_attn_version = None + + if not flash_attn_available: + logger.info(f"flash-attn not installed, using SDPA (arch={arch})") + return "sdpa" + + # Flash Attention 3: Hopper-only (H100/H200) with CUDA >= 12.3 + # Achieves 1.5-2x speedup over FA2, 75% H100 utilization + # Ref: https://arxiv.org/abs/2407.08608 + if arch == "hopper" and cuda_version and cuda_version >= (12, 3): + if _check_flash_attn_3_available(): + logger.info( + f"Using flash_attention_3 (arch={arch}, cuda={cuda_version}, " + f"flash_attn={flash_attn_version})" + ) + return "flash_attention_3" + + # Flash Attention 2: Ampere+ with CUDA >= 11.6 + if arch in ("hopper", "ampere", "ada") and cuda_version and cuda_version >= (11, 6): + logger.info( + f"Using flash_attention_2 (arch={arch}, cuda={cuda_version}, " + f"flash_attn={flash_attn_version})" + ) + return "flash_attention_2" + + # Fallback to SDPA (PyTorch native, also uses flash kernels when possible) + logger.info(f"Using SDPA (arch={arch}, cuda={cuda_version})") + return "sdpa" + + +def configure_flash_attention( + device: str = "cuda", + *, + disable_flash_attn: bool = False, +) -> 
AttentionImpl:
+    """
+    Configure Flash Attention for optimal performance.
+
+    This function:
+    1. Enables PyTorch's Flash SDP backend (if available)
+    2. Returns the optimal attention implementation for HuggingFace models
+
+    Args:
+        device: Target device ("cuda" or "cpu")
+        disable_flash_attn: If True, disable flash attention entirely.
+
+    Returns:
+        Attention implementation string to pass to model.from_pretrained().
+
+    Example:
+        attn_impl = configure_flash_attention("cuda")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            attn_implementation=attn_impl,
+        )
+    """
+    if device != "cuda" or not torch.cuda.is_available():
+        logger.info("No CUDA device, using eager attention")
+        return "eager"
+
+    # Enable Flash SDP backend in PyTorch (uses flash kernels for F.scaled_dot_product_attention)
+    if not disable_flash_attn:
+        try:
+            torch.backends.cuda.enable_flash_sdp(True)
+            logger.debug("Enabled torch.backends.cuda.flash_sdp")
+        except Exception as e:
+            logger.warning(f"Could not enable flash_sdp: {e}")
+
+    return get_optimal_attention_impl(disable_flash_attn=disable_flash_attn)
+
+
+def log_hardware_info() -> None:
+    """Log GPU hardware information for debugging."""
+    if not torch.cuda.is_available():
+        logger.info("No CUDA GPU available")
+        return
+
+    try:
+        device_name = torch.cuda.get_device_name()
+        capability = torch.cuda.get_device_capability()
+        arch = detect_gpu_architecture()
+        cuda_version = get_cuda_version()
+
+        logger.info(
+            f"GPU: {device_name} (sm_{capability[0]}{capability[1]}, {arch}), "
+            f"CUDA: {'.'.join(map(str, cuda_version)) if cuda_version else 'N/A'}"
+        )
+
+        # Check flash_attn
+        try:
+            import flash_attn
+            logger.info(f"flash-attn version: {flash_attn.__version__}")
+        except ImportError:
+            logger.info("flash-attn: not installed")
+
+    except Exception as e:
+        logger.warning(f"Could not log hardware info: {e}")
diff --git a/src/ludic/training/loss.py b/src/ludic/training/loss.py
index 18fe078..e041ae2 100644
---
a/src/ludic/training/loss.py +++ b/src/ludic/training/loss.py @@ -20,9 +20,11 @@ logger = logging.getLogger(__name__) + def _no_op(fn): return fn + _TYPECHECK_ENABLED = os.getenv("LUDIC_TYPECHECK", "0") == "1" typechecker = beartype if _TYPECHECK_ENABLED else _no_op logger.info( @@ -31,6 +33,141 @@ def _no_op(fn): ) +# --------------------------------------------------------------------------- +# Shared context for memory-efficient loss composition +# --------------------------------------------------------------------------- + + +class SharedContext: + """ + Lazy-computed shared tensors for memory-efficient loss composition. + + When multiple losses are combined via CompositeLoss, each typically needs + the same expensive intermediate tensors (e.g., token_logp from log_softmax). + Without sharing, each loss computes these independently, creating separate + autograd graphs that store duplicate [B, T, V] activations for backward. + + SharedContext solves this by computing expensive tensors ONCE on first access + and caching them for subsequent uses. All losses receive the same tensor + objects, sharing a single autograd graph. + + Memory savings example (7B model, V=32K, B=8, T=4096): + - Without sharing (2 losses): 2× [B, T, V] ≈ 4GB activations + - With sharing (2 losses): 1× [B, T, V] ≈ 2GB activations + + Usage: + # Created by CompositeLoss, passed to each child loss + shared = SharedContext(logits, batch) + + # In CISPOLoss.compute(): + token_logp = shared.token_logp # Computes and caches on first access + + # In TokenKLLoss.compute(): + token_logp = shared.token_logp # Returns cached tensor (same object!) + + Note: Properties that depend on batch["actor_logps"] will raise KeyError + if that key is missing. This is intentional - not all loss combinations + need actor logprobs. 
+ """ + + __slots__ = ("logits", "batch", "_cache") + + def __init__(self, logits: Logits, batch: Batch) -> None: + self.logits = logits + self.batch = batch + self._cache: Dict[str, Tensor] = {} + + @property + def input_ids(self) -> TokenIds: + """Token IDs from batch (not cached, just a convenience accessor).""" + return self.batch["input_ids"] + + @property + def action_mask(self) -> Mask: + """Action mask from batch (not cached, just a convenience accessor).""" + return self.batch["action_mask"] + + @property + def token_logp(self) -> Float[Tensor, "B T-1"]: + """ + Per-token log probabilities: log π(a_t|s_t) for each position. + + THIS IS THE EXPENSIVE OPERATION - calls selective_log_softmax which + requires storing [B, T, V] activations for backward. Caching this + is the primary memory optimization. + """ + if "token_logp" not in self._cache: + # Import here to avoid circular dependency (compute_token_logp defined later) + self._cache["token_logp"] = compute_token_logp(self.logits, self.input_ids) + return self._cache["token_logp"] + + @property + def token_mask(self) -> Float[Tensor, "B T-1"]: + """Action mask aligned with token_logp (shifted by 1 for next-token prediction).""" + if "token_mask" not in self._cache: + self._cache["token_mask"] = self.action_mask[:, 1:].to( + self.token_logp.dtype + ) + return self._cache["token_mask"] + + @property + def token_counts(self) -> Float[Tensor, "B"]: + """Number of action tokens per sample (for length normalization).""" + if "token_counts" not in self._cache: + self._cache["token_counts"] = self.token_mask.sum(dim=-1).clamp(min=1.0) + return self._cache["token_counts"] + + @property + def actor_logps_shifted(self) -> Float[Tensor, "B T-1"]: + """ + Behavior policy log probs aligned with token_logp. + + Raises: + KeyError: If batch["actor_logps"] is not present. 
+ """ + if "actor_logps_shifted" not in self._cache: + if "actor_logps" not in self.batch: + raise KeyError( + "SharedContext.actor_logps_shifted requires batch['actor_logps']. " + "Ensure your rollouts include actor_logps for ratio-based objectives." + ) + self._cache["actor_logps_shifted"] = self.batch["actor_logps"][:, 1:] + return self._cache["actor_logps_shifted"] + + @property + def log_ratio(self) -> Float[Tensor, "B T-1"]: + """Log importance ratio: log(π_new/π_old) per token.""" + if "log_ratio" not in self._cache: + self._cache["log_ratio"] = self.token_logp - self.actor_logps_shifted + return self._cache["log_ratio"] + + @property + def ratio(self) -> Float[Tensor, "B T-1"]: + """Importance ratio: π_new/π_old per token.""" + if "ratio" not in self._cache: + self._cache["ratio"] = torch.exp(self.log_ratio) + return self._cache["ratio"] + + def logp_action(self, *, length_normalize: bool = False) -> Float[Tensor, "B"]: + """ + Sequence-level log probability (sum over action tokens). + + Unlike token_logp, this is a cheap derivation that doesn't require + additional [B, T, V] storage. The length_normalize flag controls + whether to divide by number of action tokens. + + Args: + length_normalize: If True, return mean log prob instead of sum. + + Returns: + [B] tensor of per-sample log probabilities. + """ + masked_logp = (self.token_logp * self.token_mask).sum(dim=-1) + if length_normalize: + return masked_logp / self.token_counts + return masked_logp + + class Loss(Protocol): """ Generic loss: given model outputs (logits) and a collated batch, return @@ -40,8 +177,10 @@ class Loss(Protocol): batched D2H transfer during aggregation. Use `tensor.detach()` not `float(...)`. """ - def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Tensor]]: - ... + def compute( + self, logits: Logits, batch: Batch + ) -> Tuple[Tensor, Dict[str, Tensor]]: ... + # We define this as a standalone helper so torch.compile can cache it cleanly. 
# dynamic=True is critical for varying sequence lengths (preventing recompilation). @@ -50,7 +189,7 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Tenso def selective_log_softmax(logits: Logits, index: TokenIds) -> Float[Tensor, "B T"]: """ Fused kernel for log_softmax + gather. - + Inductor (torch.compile) generates a kernel that computes the log_softmax normalization term and selects the target token in a single pass. This avoids materializing the massive [B, T, V] probability tensor in VRAM. @@ -59,6 +198,7 @@ def selective_log_softmax(logits: Logits, index: TokenIds) -> Float[Tensor, "B T logprobs = logits.log_softmax(dim=-1) return torch.gather(logprobs, dim=-1, index=index.unsqueeze(-1)).squeeze(-1) + @jaxtyped(typechecker=typechecker) def compute_logp_action( logits: Logits, @@ -80,16 +220,18 @@ def compute_logp_action( """ if logits.ndim != 3: raise ValueError(f"Expected logits [B, T, V], got {tuple(logits.shape)}") - + if input_ids.shape != logits.shape[:2]: - raise ValueError(f"Shape mismatch: input_ids {input_ids.shape} vs logits {logits.shape}") + raise ValueError( + f"Shape mismatch: input_ids {input_ids.shape} vs logits {logits.shape}" + ) # Shift for causal LM: logits[t] predicts input_ids[t+1] if logits.size(1) < 2: raise ValueError("Sequence too short to compute next-token logprobs.") - logits_shifted = logits[:, :-1, :] # [B, T-1, V] - target_ids = input_ids[:, 1:] # [B, T-1] - action_mask_shifted = action_mask[:, 1:] # [B, T-1] + logits_shifted = logits[:, :-1, :] # [B, T-1, V] + target_ids = input_ids[:, 1:] # [B, T-1] + action_mask_shifted = action_mask[:, 1:] # [B, T-1] # Use the compiled fused kernel on aligned targets token_logp = selective_log_softmax(logits_shifted, target_ids) @@ -119,12 +261,14 @@ def compute_token_logp( if logits.ndim != 3: raise ValueError(f"Expected logits [B, T, V], got {tuple(logits.shape)}") if input_ids.shape != logits.shape[:2]: - raise ValueError(f"Shape mismatch: input_ids 
{input_ids.shape} vs logits {logits.shape}") + raise ValueError( + f"Shape mismatch: input_ids {input_ids.shape} vs logits {logits.shape}" + ) if logits.size(1) < 2: raise ValueError("Sequence too short to compute next-token logprobs.") - logits_shifted = logits[:, :-1, :] # [B, T-1, V] - target_ids = input_ids[:, 1:] # [B, T-1] + logits_shifted = logits[:, :-1, :] # [B, T-1, V] + target_ids = input_ids[:, 1:] # [B, T-1] return selective_log_softmax(logits_shifted, target_ids) @@ -155,21 +299,34 @@ class ReinforceLoss: old_logp_key: str = "old_logp_action" @jaxtyped(typechecker=typechecker) - def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]]: - input_ids = batch["input_ids"] # [B, T] - action_mask = batch["action_mask"] # [B, T] - advantages = batch["weight"] # [B] + def compute( + self, + logits: Logits, + batch: Batch, + *, + shared: Optional[SharedContext] = None, + ) -> Tuple[Tensor, Dict[str, Any]]: + advantages = batch["weight"] # [B] if self.old_logp_key not in batch: raise KeyError(f"ReinforceLoss requires '{self.old_logp_key}' in batch.") - logp_action = compute_logp_action( - logits, input_ids, action_mask, length_normalize=self.length_normalize - ) # [B] + # Use shared context if available (memory-efficient composition) + if shared is not None: + logp_action = shared.logp_action(length_normalize=self.length_normalize) + token_counts = shared.token_counts + else: + # Backward-compatible: compute locally + input_ids = batch["input_ids"] # [B, T] + action_mask = batch["action_mask"] # [B, T] + logp_action = compute_logp_action( + logits, input_ids, action_mask, length_normalize=self.length_normalize + ) # [B] + token_counts = action_mask[:, 1:].sum(dim=-1).clamp(min=1.0) old_logp = batch[self.old_logp_key] # [B] if self.length_normalize: - lengths = action_mask[:, 1:].to(old_logp.dtype).sum(dim=-1).clamp(min=1.0) + lengths = token_counts.to(old_logp.dtype) old_logp = old_logp / lengths log_ratio = logp_action - old_logp @@ 
-178,7 +335,7 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] mismatch_kl = ratio - log_ratio - 1.0 # Stop-gradient on IS weight: gradient flows only through log π - loss = - (ratio.detach() * advantages * logp_action).mean() + loss = -(ratio.detach() * advantages * logp_action).mean() stats: Dict[str, Any] = { "loss": loss.detach(), @@ -222,7 +379,9 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] if logits.ndim != 3: raise ValueError(f"Expected logits [B, T, V], got {tuple(logits.shape)}") if input_ids.shape != logits.shape[:2]: - raise ValueError(f"Shape mismatch: input_ids {input_ids.shape} vs logits {logits.shape}") + raise ValueError( + f"Shape mismatch: input_ids {input_ids.shape} vs logits {logits.shape}" + ) if logits.size(1) < 2: raise ValueError("Sequence too short to compute next-token loss.") @@ -281,21 +440,36 @@ class ReinforceBaselineLoss: old_logp_key: str = "old_logp_action" @jaxtyped(typechecker=typechecker) - def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]]: - input_ids = batch["input_ids"] - action_mask = batch["action_mask"] - adv_raw = batch["weight"] # [B] + def compute( + self, + logits: Logits, + batch: Batch, + *, + shared: Optional[SharedContext] = None, + ) -> Tuple[Tensor, Dict[str, Any]]: + adv_raw = batch["weight"] # [B] if self.old_logp_key not in batch: - raise KeyError(f"ReinforceBaselineLoss requires '{self.old_logp_key}' in batch.") + raise KeyError( + f"ReinforceBaselineLoss requires '{self.old_logp_key}' in batch." 
+ ) - logp_action = compute_logp_action( - logits, input_ids, action_mask, length_normalize=self.length_normalize - ) # [B] + # Use shared context if available (memory-efficient composition) + if shared is not None: + logp_action = shared.logp_action(length_normalize=self.length_normalize) + token_counts = shared.token_counts + else: + # Backward-compatible: compute locally + input_ids = batch["input_ids"] + action_mask = batch["action_mask"] + logp_action = compute_logp_action( + logits, input_ids, action_mask, length_normalize=self.length_normalize + ) # [B] + token_counts = action_mask[:, 1:].sum(dim=-1).clamp(min=1.0) old_logp = batch[self.old_logp_key] # [B] if self.length_normalize: - lengths = action_mask[:, 1:].to(old_logp.dtype).sum(dim=-1).clamp(min=1.0) + lengths = token_counts.to(old_logp.dtype) old_logp = old_logp / lengths log_ratio = logp_action - old_logp @@ -310,7 +484,7 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] advantages = advantages / (std + 1e-8) # Stop-gradient on IS weight - loss = - (ratio.detach() * advantages * logp_action).mean() + loss = -(ratio.detach() * advantages * logp_action).mean() stats: Dict[str, Any] = { "loss": loss.detach(), @@ -362,22 +536,38 @@ def __post_init__(self) -> None: raise ValueError(f"ratio_clip must be positive, got {self.ratio_clip}") @jaxtyped(typechecker=typechecker) - def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]]: - input_ids = batch["input_ids"] - action_mask = batch["action_mask"] - advantages = batch["weight"] # [B] + def compute( + self, + logits: Logits, + batch: Batch, + *, + shared: Optional[SharedContext] = None, + ) -> Tuple[Tensor, Dict[str, Any]]: + advantages = batch["weight"] # [B] if self.old_logp_key not in batch: - raise KeyError(f"ClippedSurrogateLoss requires '{self.old_logp_key}' in batch.") - - logp_action = compute_logp_action( - logits, - input_ids, - action_mask, - length_normalize=self.length_normalize, - ) # 
[B] + raise KeyError( + f"ClippedSurrogateLoss requires '{self.old_logp_key}' in batch." + ) + + # Use shared context if available (memory-efficient composition) + if shared is not None: + logp_action = shared.logp_action(length_normalize=self.length_normalize) + token_counts = shared.token_counts + else: + # Backward-compatible: compute locally + input_ids = batch["input_ids"] + action_mask = batch["action_mask"] + logp_action = compute_logp_action( + logits, + input_ids, + action_mask, + length_normalize=self.length_normalize, + ) # [B] + token_counts = action_mask[:, 1:].sum(dim=-1).clamp(min=1.0) + old_logp = batch[self.old_logp_key] # [B] if self.length_normalize: - lengths = action_mask[:, 1:].to(old_logp.dtype).sum(dim=-1).clamp(min=1.0) + lengths = token_counts.to(old_logp.dtype) old_logp = old_logp / lengths log_ratio = logp_action - old_logp @@ -388,16 +578,19 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] ratio = torch.clamp(ratio, max=self.ratio_clip) unclipped = ratio * advantages - clipped = torch.clamp( - ratio, 1.0 - self.clip_eps_low, 1.0 + self.clip_eps_high - ) * advantages + clipped = ( + torch.clamp(ratio, 1.0 - self.clip_eps_low, 1.0 + self.clip_eps_high) + * advantages + ) obj = torch.min(unclipped, clipped) loss = -obj.mean() ppo_clip_frac = ( - (ratio > 1.0 + self.clip_eps_high) | (ratio < 1.0 - self.clip_eps_low) - ).float().mean() + ((ratio > 1.0 + self.clip_eps_high) | (ratio < 1.0 - self.clip_eps_low)) + .float() + .mean() + ) if self.ratio_clip is not None: ratio_clip_frac = (ratio >= self.ratio_clip).float().mean() else: @@ -450,33 +643,51 @@ class CISPOLoss: def __post_init__(self) -> None: if self.clip_eps_high < 0: - raise ValueError(f"clip_eps_high must be non-negative, got {self.clip_eps_high}") - - @jaxtyped(typechecker=typechecker) - def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]]: - input_ids = batch["input_ids"] - action_mask = batch["action_mask"] - 
advantages = batch["weight"] # [B] - - if "actor_logps" not in batch: - raise KeyError("CISPOLoss requires batch['actor_logps'] for importance sampling.") - - actor_logps = batch["actor_logps"] # [B, T] - if actor_logps.shape != input_ids.shape: raise ValueError( - f"actor_logps shape {tuple(actor_logps.shape)} does not match input_ids " - f"{tuple(input_ids.shape)}." + f"clip_eps_high must be non-negative, got {self.clip_eps_high}" ) - # Compute token log probs under current policy - token_logp = compute_token_logp(logits, input_ids) # [B, T-1] - token_mask = action_mask[:, 1:].to(token_logp.dtype) # [B, T-1] - token_counts = token_mask.sum(dim=-1).clamp(min=1.0) # [B] - actor_logps_shifted = actor_logps[:, 1:] # [B, T-1] + @jaxtyped(typechecker=typechecker) + def compute( + self, + logits: Logits, + batch: Batch, + *, + shared: Optional[SharedContext] = None, + ) -> Tuple[Tensor, Dict[str, Any]]: + advantages = batch["weight"] # [B] - # Compute importance sampling ratios - log_ratio = token_logp - actor_logps_shifted - ratio = torch.exp(log_ratio) + # Use shared context if available (memory-efficient composition) + if shared is not None: + token_logp = shared.token_logp + token_mask = shared.token_mask + token_counts = shared.token_counts + actor_logps_shifted = shared.actor_logps_shifted + log_ratio = shared.log_ratio + ratio = shared.ratio + else: + # Backward-compatible: compute locally + input_ids = batch["input_ids"] + action_mask = batch["action_mask"] + + if "actor_logps" not in batch: + raise KeyError( + "CISPOLoss requires batch['actor_logps'] for importance sampling." + ) + + actor_logps = batch["actor_logps"] # [B, T] + if actor_logps.shape != input_ids.shape: + raise ValueError( + f"actor_logps shape {tuple(actor_logps.shape)} does not match input_ids " + f"{tuple(input_ids.shape)}." 
+ ) + + token_logp = compute_token_logp(logits, input_ids) # [B, T-1] + token_mask = action_mask[:, 1:].to(token_logp.dtype) # [B, T-1] + token_counts = token_mask.sum(dim=-1).clamp(min=1.0) # [B] + actor_logps_shifted = actor_logps[:, 1:] # [B, T-1] + log_ratio = token_logp - actor_logps_shifted + ratio = torch.exp(log_ratio) # CISPO: Clip the IS weight, not the update clipped_ratio = torch.clamp( @@ -508,9 +719,13 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] ratio_std = ratio_vals.std(unbiased=False) # Fraction of tokens where IS weight was clipped clip_frac = ( - (ratio_vals > 1.0 + self.clip_eps_high) | - (ratio_vals < 1.0 - self.clip_eps_low) - ).float().mean() + ( + (ratio_vals > 1.0 + self.clip_eps_high) + | (ratio_vals < 1.0 - self.clip_eps_low) + ) + .float() + .mean() + ) # KL approximation: r - log(r) - 1 token_kl = ratio_vals - log_ratio.masked_select(mask) - 1.0 mismatch_kl = token_kl.mean() @@ -535,6 +750,112 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] return loss, stats +@dataclass +class TokenKLLoss: + """ + Token-level KL penalty between π_new and the behavior policy (actor). + + Uses token-level log probabilities from batch["actor_logps"] to compute: + + KL(π_new || π_actor) ≈ mean_t[ log π_new(a_t|s_t) - log π_actor(a_t|s_t) ] + + This is designed to be consistent with CISPO's token-level approach, + where we want to penalize the policy for deviating too far from the + behavior policy at each token position. + + Loss is: + loss = coeff * mean(kl) + + Args: + coeff: KL penalty coefficient. Positive values penalize divergence. + length_normalize: If True, normalize by number of action tokens per sample + before averaging across samples. 
+ + Expects: + - batch["actor_logps"]: [B, T] log probs under behavior policy + - batch["action_mask"]: [B, T] mask for action tokens + - batch["input_ids"]: [B, T] token ids + + Reference: Useful for stabilizing CISPO training (MiniMax-M1, arXiv:2506.13585) + """ + + coeff: float = 0.01 + length_normalize: bool = True + + @jaxtyped(typechecker=typechecker) + def compute( + self, + logits: Logits, + batch: Batch, + *, + shared: Optional[SharedContext] = None, + ) -> Tuple[Tensor, Dict[str, Any]]: + # Use shared context if available (memory-efficient composition) + if shared is not None: + token_logp = shared.token_logp + token_mask = shared.token_mask + token_counts = shared.token_counts + actor_logps_shifted = shared.actor_logps_shifted + token_kl = shared.log_ratio # log π_new - log π_actor + else: + # Backward-compatible: compute locally + input_ids = batch["input_ids"] + action_mask = batch["action_mask"] + + if "actor_logps" not in batch: + raise KeyError( + "TokenKLLoss requires batch['actor_logps'] for KL computation." + ) + + actor_logps = batch["actor_logps"] # [B, T] + if actor_logps.shape != input_ids.shape: + raise ValueError( + f"actor_logps shape {tuple(actor_logps.shape)} does not match input_ids " + f"{tuple(input_ids.shape)}." 
+ ) + + token_logp = compute_token_logp(logits, input_ids) # [B, T-1] + token_mask = action_mask[:, 1:].to(token_logp.dtype) # [B, T-1] + token_counts = token_mask.sum(dim=-1).clamp(min=1.0) # [B] + actor_logps_shifted = actor_logps[:, 1:] # [B, T-1] + token_kl = token_logp - actor_logps_shifted # [B, T-1] + + # Masked sum per sample + per_sample_kl = (token_kl * token_mask).sum(dim=-1) # [B] + if self.length_normalize: + per_sample_kl = per_sample_kl / token_counts + + mean_kl = per_sample_kl.mean() + loss = self.coeff * mean_kl + + # Stats + mask = token_mask > 0 + if mask.any(): + kl_vals = token_kl.masked_select(mask) + kl_mean = kl_vals.mean() + kl_std = kl_vals.std(unbiased=False) + # Also compute ratio stats for monitoring + ratio = torch.exp(token_kl) + ratio_vals = ratio.masked_select(mask) + ratio_mean = ratio_vals.mean() + ratio_std = ratio_vals.std(unbiased=False) + else: + kl_mean = torch.zeros((), device=token_kl.device, dtype=token_kl.dtype) + kl_std = torch.zeros((), device=token_kl.device, dtype=token_kl.dtype) + ratio_mean = torch.zeros((), device=token_kl.device, dtype=token_kl.dtype) + ratio_std = torch.zeros((), device=token_kl.device, dtype=token_kl.dtype) + + stats: Dict[str, Any] = { + "loss": loss.detach(), + "kl_mean": kl_mean.detach(), + "kl_std": kl_std.detach(), + "ratio_mean": ratio_mean.detach(), + "ratio_std": ratio_std.detach(), + "avg_action_tokens": token_counts.mean().detach(), + } + return loss, stats + + @dataclass class TokenClippedSurrogateLoss: """ @@ -565,27 +886,45 @@ def __post_init__(self) -> None: raise ValueError(f"ratio_clip must be positive, got {self.ratio_clip}") @jaxtyped(typechecker=typechecker) - def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]]: - input_ids = batch["input_ids"] - action_mask = batch["action_mask"] + def compute( + self, + logits: Logits, + batch: Batch, + *, + shared: Optional[SharedContext] = None, + ) -> Tuple[Tensor, Dict[str, Any]]: advantages = 
batch["weight"] - if "actor_logps" not in batch: - raise KeyError("TokenClippedSurrogateLoss requires batch['actor_logps'] for token IS.") - actor_logps = batch["actor_logps"] - if actor_logps.shape != input_ids.shape: - raise ValueError( - f"actor_logps shape {tuple(actor_logps.shape)} does not match input_ids " - f"{tuple(input_ids.shape)}." - ) - - token_logp = compute_token_logp(logits, input_ids) # [B, T-1] - token_mask = action_mask[:, 1:].to(token_logp.dtype) - token_counts = token_mask.sum(dim=-1).clamp(min=1.0) - actor_logps_shifted = actor_logps[:, 1:] + # Use shared context if available (memory-efficient composition) + if shared is not None: + token_logp = shared.token_logp + token_mask = shared.token_mask + token_counts = shared.token_counts + log_ratio = shared.log_ratio + ratio_raw = shared.ratio + else: + # Backward-compatible: compute locally + input_ids = batch["input_ids"] + action_mask = batch["action_mask"] + if "actor_logps" not in batch: + raise KeyError( + "TokenClippedSurrogateLoss requires batch['actor_logps'] for token IS." + ) + + actor_logps = batch["actor_logps"] + if actor_logps.shape != input_ids.shape: + raise ValueError( + f"actor_logps shape {tuple(actor_logps.shape)} does not match input_ids " + f"{tuple(input_ids.shape)}." 
+ ) + + token_logp = compute_token_logp(logits, input_ids) # [B, T-1] + token_mask = action_mask[:, 1:].to(token_logp.dtype) + token_counts = token_mask.sum(dim=-1).clamp(min=1.0) + actor_logps_shifted = actor_logps[:, 1:] + log_ratio = token_logp - actor_logps_shifted + ratio_raw = torch.exp(log_ratio) - log_ratio = token_logp - actor_logps_shifted - ratio_raw = torch.exp(log_ratio) token_mismatch_kl = ratio_raw - log_ratio - 1.0 ratio = ratio_raw if self.ratio_clip is not None: @@ -608,15 +947,22 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] if mask.any(): ratio_vals = ratio.masked_select(mask) ppo_clip_frac = ( - (ratio_vals > 1.0 + self.clip_eps_high) | (ratio_vals < 1.0 - self.clip_eps_low) - ).float().mean() + ( + (ratio_vals > 1.0 + self.clip_eps_high) + | (ratio_vals < 1.0 - self.clip_eps_low) + ) + .float() + .mean() + ) ratio_mean = ratio_vals.mean() ratio_std = ratio_vals.std(unbiased=False) mismatch_kl = token_mismatch_kl.masked_select(mask).mean() if self.ratio_clip is not None: ratio_clip_frac = (ratio_vals >= self.ratio_clip).float().mean() else: - ratio_clip_frac = torch.zeros((), device=ratio.device, dtype=ratio.dtype) + ratio_clip_frac = torch.zeros( + (), device=ratio.device, dtype=ratio.dtype + ) else: ratio_mean = torch.zeros((), device=ratio.device, dtype=ratio.dtype) ratio_std = torch.zeros((), device=ratio.device, dtype=ratio.dtype) @@ -667,22 +1013,36 @@ class KLLoss: length_normalize: bool = False @jaxtyped(typechecker=typechecker) - def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]]: - input_ids = batch["input_ids"] - action_mask = batch["action_mask"] - old_logp = batch[self.old_logp_key] # [B] - - logp_new = compute_logp_action( - logits, - input_ids, - action_mask, - length_normalize=self.length_normalize, - ) # [B] + def compute( + self, + logits: Logits, + batch: Batch, + *, + shared: Optional[SharedContext] = None, + ) -> Tuple[Tensor, Dict[str, Any]]: + old_logp = 
batch[self.old_logp_key] # [B] + + # Use shared context if available (memory-efficient composition) + if shared is not None: + logp_new = shared.logp_action(length_normalize=self.length_normalize) + token_counts = shared.token_counts + else: + # Backward-compatible: compute locally + input_ids = batch["input_ids"] + action_mask = batch["action_mask"] + logp_new = compute_logp_action( + logits, + input_ids, + action_mask, + length_normalize=self.length_normalize, + ) # [B] + token_counts = action_mask[:, 1:].sum(dim=-1).clamp(min=1.0) + if self.length_normalize: - lengths = action_mask[:, 1:].to(old_logp.dtype).sum(dim=-1).clamp(min=1.0) + lengths = token_counts.to(old_logp.dtype) old_logp = old_logp / lengths - kl = logp_new - old_logp # [B] + kl = logp_new - old_logp # [B] loss = self.coeff * kl.mean() stats: Dict[str, Any] = { @@ -720,7 +1080,7 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] mask = action_mask.to(token_entropy.dtype) - masked_entropy = token_entropy * mask # [B, T] + masked_entropy = token_entropy * mask # [B, T] # avoid divide-by-zero if mask is all zeros denom = mask.sum() if denom.item() == 0: @@ -751,6 +1111,7 @@ class LossTerm: - loss: loss object implementing Loss protocol - weight: scalar multiplier applied to that loss """ + name: str loss: Loss weight: float = 1.0 @@ -768,9 +1129,20 @@ class CompositeLoss: "{name}/loss", "{name}/", ... and a top-level "loss" key for the final combined loss. - - This class expects logits to be passed in, and it passes them - down to all child terms. + + Memory Efficiency: + CompositeLoss automatically creates a SharedContext to cache expensive + intermediate tensors (like token_logp from log_softmax). All child losses + receive the same SharedContext, enabling them to share the cached tensors + and avoiding duplicate autograd graphs. 
+ + Without SharedContext: N losses → N× [B, T, V] autograd activations + With SharedContext: N losses → 1× [B, T, V] autograd activations + + Note: + All child losses must accept a `shared` keyword argument of type + Optional[SharedContext]. Losses that don't support this will cause + a TypeError with a helpful message. """ terms: List[LossTerm] @@ -780,12 +1152,26 @@ def compute(self, logits: Logits, batch: Batch) -> Tuple[Tensor, Dict[str, Any]] if not self.terms: raise ValueError("CompositeLoss.terms must be non-empty") + # Create shared context for memory-efficient tensor sharing + shared = SharedContext(logits, batch) + total_loss: Tensor | None = None stats: Dict[str, Any] = {} for term in self.terms: - # Pass the pre-computed logits down to the child term - raw_loss, term_stats = term.loss.compute(logits, batch) + # Pass shared context to enable memory-efficient composition + try: + raw_loss, term_stats = term.loss.compute(logits, batch, shared=shared) + except TypeError as e: + if "shared" in str(e): + raise TypeError( + f"Loss '{term.name}' ({type(term.loss).__name__}) does not accept " + f"'shared' parameter. All losses used in CompositeLoss must accept " + f"Optional[SharedContext] for memory-efficient composition. 
" + f"Update the loss class to add: def compute(..., *, shared: Optional[SharedContext] = None)" + ) from e + raise + scaled_loss = term.weight * raw_loss if total_loss is None: diff --git a/src/ludic/training/trainer.py b/src/ludic/training/trainer.py index 8a01d8a..77d7fa5 100644 --- a/src/ludic/training/trainer.py +++ b/src/ludic/training/trainer.py @@ -575,7 +575,11 @@ async def train_step(self) -> Dict[str, float]: # ---- 2c) Loss + backward (scaled) -------------------------- pre_forward_alloc = self._reset_peak_memory(device) if profile_memory else None try: - loss, stats = self.algo.compute_loss(self.model, batch_tensors) + loss, stats = self.algo.compute_loss( + self.model, + batch_tensors, + cast_logits_to_fp32=self.cfg.cast_logits_to_fp32, + ) # Scale loss by micro-batch size to preserve macro-batch mean. scaled_loss = loss * (item_count / total_items) diff --git a/tests/integration/test_code_exec_docker.py b/tests/integration/test_code_exec_docker.py new file mode 100644 index 0000000..ed14ed6 --- /dev/null +++ b/tests/integration/test_code_exec_docker.py @@ -0,0 +1,615 @@ +""" +Integration tests for Docker-based code execution sandbox. + +These tests require Docker to be running and will create/destroy containers. 
+Run with: pytest -m integration tests/integration/test_code_exec_docker.py + +To skip GPU tests while running integration tests: + pytest -m "integration and not gpu" +""" + +from __future__ import annotations + +import asyncio + +import pytest + +pytestmark = [pytest.mark.integration] + + +# Try to import docker - skip all tests if not available +try: + import docker + from docker.errors import DockerException + + # Try to connect to Docker daemon + try: + _client = docker.from_env() + _client.ping() + _client.close() + DOCKER_AVAILABLE = True + except Exception: + # Any failure here (DockerException or otherwise) means the + # daemon is unusable, so catch broadly. + DOCKER_AVAILABLE = False +except ImportError: + DOCKER_AVAILABLE = False + + +skip_if_no_docker = pytest.mark.skipif( + not DOCKER_AVAILABLE, + reason="Docker daemon not available or docker package not installed", +) + + +# --------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------- + + +@pytest.fixture +async def sandbox_pool(): + """Create and tear down a sandbox pool for testing.""" + from ludic.envs.code_exec.docker_sandbox import DockerSandboxPool, DockerSandboxConfig + + config = DockerSandboxConfig( + python_version="3.11", + memory_limit="128m", + cpu_quota=25000, + network_disabled=True, + ) + + pool = DockerSandboxPool( + n_workers=2, + config=config, + cache_size=100, + ) + + await pool.start() + yield pool + await pool.shutdown() + + +@pytest.fixture +async def sandbox(sandbox_pool): + """Get a single sandbox for testing.""" + sandbox = await sandbox_pool.checkout() + yield sandbox + await sandbox_pool.release(sandbox) + + +# --------------------------------------------------------------------- +# DockerSandbox Tests +# --------------------------------------------------------------------- + + +@skip_if_no_docker +class TestDockerSandboxCompile: + @pytest.mark.asyncio + async def test_compile_valid_code(self, sandbox): + """Valid Python code should compile successfully.""" + from
ludic.envs.code_exec.types import CompileStatus + + code = """ +def hello(): + return "Hello, World!" + +print(hello()) +""" + result = await sandbox.compile(code) + + assert result.success is True + assert result.status == CompileStatus.SUCCESS + assert result.error_message is None + assert result.duration_ms > 0 + + @pytest.mark.asyncio + async def test_compile_syntax_error(self, sandbox): + """Syntax errors should be detected and reported.""" + from ludic.envs.code_exec.types import CompileStatus + + code = """ +def broken( + print("missing parenthesis") +""" + result = await sandbox.compile(code) + + assert result.success is False + assert result.status == CompileStatus.SYNTAX_ERROR + assert result.error_message is not None + assert "SyntaxError" in result.error_message or "syntax" in result.error_message.lower() + + @pytest.mark.asyncio + async def test_compile_indentation_error(self, sandbox): + """Indentation errors should be detected.""" + from ludic.envs.code_exec.types import CompileStatus + + code = """ +def foo(): +print("bad indent") +""" + result = await sandbox.compile(code) + + assert result.success is False + assert result.status == CompileStatus.SYNTAX_ERROR + + +@skip_if_no_docker +class TestDockerSandboxExecute: + @pytest.mark.asyncio + async def test_execute_simple_print(self, sandbox): + """Simple print statement should produce output.""" + from ludic.envs.code_exec.types import RunStatus + + code = 'print("Hello from Docker!")' + result = await sandbox.execute(code) + + assert result.compiled is True + assert result.succeeded is True + assert result.run_status == RunStatus.SUCCESS + assert "Hello from Docker!" 
in result.stdout.strip() + assert result.exit_code == 0 + + @pytest.mark.asyncio + async def test_execute_with_stdin(self, sandbox): + """Code should be able to read from stdin.""" + from ludic.envs.code_exec.types import RunStatus + + code = """ +import sys +line = input() +print(f"Got: {line}") +""" + result = await sandbox.execute(code, stdin="test_input") + + assert result.compiled is True + # Note: stdin handling in docker exec is tricky + # This test may need adjustment based on actual behavior + + @pytest.mark.asyncio + async def test_execute_runtime_error(self, sandbox): + """Runtime errors should be captured.""" + from ludic.envs.code_exec.types import RunStatus + + code = """ +x = undefined_variable +""" + result = await sandbox.execute(code) + + assert result.compiled is True + assert result.succeeded is False + assert result.run_status == RunStatus.RUNTIME_ERROR + assert "NameError" in result.stderr or "undefined" in result.stderr.lower() + + @pytest.mark.asyncio + async def test_execute_division_by_zero(self, sandbox): + """Division by zero should be a runtime error.""" + from ludic.envs.code_exec.types import RunStatus + + code = """ +result = 1 / 0 +""" + result = await sandbox.execute(code) + + assert result.compiled is True + assert result.succeeded is False + assert result.run_status == RunStatus.RUNTIME_ERROR + assert "ZeroDivision" in result.stderr + + @pytest.mark.asyncio + async def test_execute_timeout(self, sandbox): + """Infinite loops should timeout.""" + from ludic.envs.code_exec.types import RunStatus + + code = """ +while True: + pass +""" + result = await sandbox.execute(code, timeout_s=1.0) + + assert result.compiled is True + assert result.timed_out is True + assert result.run_status == RunStatus.TIMEOUT + + @pytest.mark.asyncio + async def test_execute_returns_timing(self, sandbox): + """Execution should return timing information.""" + code = """ +import time +time.sleep(0.1) +print("done") +""" + result = await 
sandbox.execute(code) + + assert result.compile_duration_ms > 0 + assert result.run_duration_ms >= 100 # At least 100ms for sleep + assert result.total_duration_ms > 0 + + +@skip_if_no_docker +class TestDockerSandboxReset: + @pytest.mark.asyncio + async def test_reset_clears_files(self, sandbox): + """Reset should clear workspace files.""" + # Write a file + code1 = """ +with open('test_file.txt', 'w') as f: + f.write('hello') +""" + await sandbox.execute(code1) + + # Reset + await sandbox.reset() + + # Try to read the file - should fail + code2 = """ +try: + with open('test_file.txt', 'r') as f: + print(f.read()) +except FileNotFoundError: + print("FILE_NOT_FOUND") +""" + result = await sandbox.execute(code2) + + assert "FILE_NOT_FOUND" in result.stdout + + +# --------------------------------------------------------------------- +# DockerSandboxPool Tests +# --------------------------------------------------------------------- + + +@skip_if_no_docker +class TestDockerSandboxPool: + @pytest.mark.asyncio + async def test_pool_checkout_and_release(self, sandbox_pool): + """Should be able to checkout and release sandboxes.""" + sandbox = await sandbox_pool.checkout() + assert sandbox is not None + assert sandbox_pool.available == 1 # One still available + + await sandbox_pool.release(sandbox) + assert sandbox_pool.available == 2 # Both available again + + @pytest.mark.asyncio + async def test_pool_concurrent_checkout(self, sandbox_pool): + """Multiple checkouts should work concurrently.""" + sandbox1 = await sandbox_pool.checkout() + sandbox2 = await sandbox_pool.checkout() + + assert sandbox1 is not sandbox2 + assert sandbox_pool.available == 0 + + await sandbox_pool.release(sandbox1) + await sandbox_pool.release(sandbox2) + assert sandbox_pool.available == 2 + + @pytest.mark.asyncio + async def test_pool_checkout_timeout(self, sandbox_pool): + """Checkout should timeout when no sandboxes available.""" + # Check out all sandboxes + sandbox1 = await 
sandbox_pool.checkout() + sandbox2 = await sandbox_pool.checkout() + + # Third checkout should timeout + with pytest.raises(TimeoutError): + await sandbox_pool.checkout(timeout_s=0.5) + + await sandbox_pool.release(sandbox1) + await sandbox_pool.release(sandbox2) + + @pytest.mark.asyncio + async def test_pool_caching(self, sandbox_pool): + """Pool should cache execution results.""" + from ludic.envs.code_exec.types import ( + BatchTestResult, + CompileResult, + CompileStatus, + ExecutionResult, + RunStatus, + TestCase, + TestResult, + ) + + # Create a mock result + test_result = TestResult( + test_case=TestCase(input="1", expected="2", id="t1"), + passed=True, + actual="2", + execution=ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + ), + ) + batch_result = BatchTestResult( + results=[test_result], + code_hash="abc123", + tests_hash="xyz789", + ) + + # Cache it + sandbox_pool.put_cached("abc123", "xyz789", batch_result) + + # Retrieve it + cached = sandbox_pool.get_cached("abc123", "xyz789") + assert cached is batch_result + + # Check cache stats + stats = sandbox_pool.cache_stats + assert stats["hits"] == 1 + assert stats["size"] == 1 + + +# --------------------------------------------------------------------- +# StdinStdoutRunner Integration Tests +# --------------------------------------------------------------------- + + +@skip_if_no_docker +class TestStdinStdoutRunnerIntegration: + @pytest.mark.asyncio + async def test_runner_all_pass(self, sandbox): + """Runner should correctly execute code and verify outputs.""" + from ludic.envs.code_exec.runners import StdinStdoutRunner + from ludic.envs.code_exec.adapters.base import ExactMatchVerifier + from ludic.envs.code_exec.types import TestCase + + code = """ +n = int(input()) +print(n * 2) +""" + tests = [ + TestCase(input="5", expected="10", id="t1"), + TestCase(input="10", expected="20", id="t2"), + TestCase(input="0", expected="0", id="t3"), + ] + 
+
+        runner = StdinStdoutRunner(default_timeout_s=5.0)
+        verifier = ExactMatchVerifier()
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code=code,
+            tests=tests,
+            verifier=verifier,
+        )
+
+        assert result.all_passed is True
+        assert result.passed_count == 3
+        assert result.total_count == 3
+
+    @pytest.mark.asyncio
+    async def test_runner_some_fail(self, sandbox):
+        """Runner should correctly identify failing tests."""
+        from ludic.envs.code_exec.runners import StdinStdoutRunner
+        from ludic.envs.code_exec.adapters.base import ExactMatchVerifier
+        from ludic.envs.code_exec.types import TestCase
+
+        # Code that only works for non-negative numbers
+        code = """
+n = int(input())
+if n < 0:
+    print("error")
+else:
+    print(n * 2)
+"""
+        tests = [
+            TestCase(input="5", expected="10", id="t1"),  # Pass
+            TestCase(input="-5", expected="-10", id="t2"),  # Fail
+        ]
+
+        runner = StdinStdoutRunner(default_timeout_s=5.0)
+        verifier = ExactMatchVerifier()
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code=code,
+            tests=tests,
+            verifier=verifier,
+            stop_on_first_failure=False,
+        )
+
+        assert result.all_passed is False
+        assert result.passed_count == 1
+        assert result.total_count == 2
+        assert result.results[0].passed is True
+        assert result.results[1].passed is False
+
+    @pytest.mark.asyncio
+    async def test_runner_compile_failure(self, sandbox):
+        """Runner should handle compilation failures gracefully."""
+        from ludic.envs.code_exec.runners import StdinStdoutRunner
+        from ludic.envs.code_exec.adapters.base import ExactMatchVerifier
+        from ludic.envs.code_exec.types import TestCase
+
+        code = """
+def broken(
+    print("syntax error")
+"""
+        tests = [
+            TestCase(input="1", expected="x", id="t1"),
+            TestCase(input="2", expected="y", id="t2"),
+        ]
+
+        runner = StdinStdoutRunner()
+        verifier = ExactMatchVerifier()
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code=code,
+            tests=tests,
+            verifier=verifier,
+            compile_first=True,
+        )
+
+        assert result.compile_failed is True
+        assert result.all_passed is False
+        assert result.passed_count == 0
+        # All tests should be marked as not compiled
+        for r in result.results:
+            assert r.compiled is False
+
+    @pytest.mark.asyncio
+    async def test_runner_stop_on_first_failure(self, sandbox):
+        """Runner should stop after first failure when configured."""
+        from ludic.envs.code_exec.runners import StdinStdoutRunner
+        from ludic.envs.code_exec.adapters.base import ExactMatchVerifier
+        from ludic.envs.code_exec.types import TestCase, RunStatus
+
+        code = """
+n = int(input())
+print("wrong" if n == 1 else "correct")
+"""
+        tests = [
+            TestCase(input="1", expected="correct", id="t1"),  # Fails
+            TestCase(input="2", expected="correct", id="t2"),  # Skipped
+            TestCase(input="3", expected="correct", id="t3"),  # Skipped
+        ]
+
+        runner = StdinStdoutRunner()
+        verifier = ExactMatchVerifier()
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code=code,
+            tests=tests,
+            verifier=verifier,
+            stop_on_first_failure=True,
+        )
+
+        assert result.passed_count == 0
+        assert result.results[0].passed is False
+        assert result.results[0].ran is True
+        assert result.results[1].ran is False
+        assert result.results[1].execution.run_status == RunStatus.NOT_RUN
+        assert result.results[2].ran is False
+
+
+# ---------------------------------------------------------------------
+# End-to-End CodeExecEnv Tests
+# ---------------------------------------------------------------------
+
+
+@skip_if_no_docker
+class TestCodeExecEnvIntegration:
+    @pytest.mark.asyncio
+    async def test_env_full_workflow(self, sandbox_pool):
+        """Test complete workflow from reset to step."""
+        from ludic.envs.code_exec.env import CodeExecEnv, CodeExecConfig
+        from ludic.envs.code_exec.adapters.apps import APPSTestAdapter
+
+        sample = {
+            "problem_id": "test_add",
+            "question": "Write a program that reads two integers and prints their sum.",
+            "inputs": ["1 2", "10 20", "-5 5"],
+            "outputs": ["3", "30", "0"],
+        }
+
+        adapter = APPSTestAdapter()
+        config = CodeExecConfig(
+            timeout_per_test_s=5.0,
+            stop_on_first_failure=False,
+            compile_first=True,
+        )
+
+        env = CodeExecEnv(
+            sample=sample,
+            sandbox_pool=sandbox_pool,
+            test_adapter=adapter,
+            config=config,
+        )
+
+        # Reset
+        obs, info = await env.env_reset()
+
+        assert "two integers" in obs.lower()
+        assert info["problem_id"] == "test_add"
+        assert info["num_tests"] == 3
+
+        # Submit correct code
+        correct_code = """
+a, b = map(int, input().split())
+print(a + b)
+"""
+        outcome = await env.env_step(correct_code)
+
+        assert outcome.terminated is True
+        assert outcome.reward == 1.0
+        assert outcome.info["all_passed"] is True
+        assert outcome.info["passed"] == 3
+        assert outcome.info["total"] == 3
+
+    @pytest.mark.asyncio
+    async def test_env_wrong_code(self, sandbox_pool):
+        """Test env with incorrect code submission."""
+        from ludic.envs.code_exec.env import CodeExecEnv, CodeExecConfig
+        from ludic.envs.code_exec.adapters.apps import APPSTestAdapter
+
+        sample = {
+            "problem_id": "test_double",
+            "question": "Write a program that reads an integer and prints it doubled.",
+            "inputs": ["5", "10"],
+            "outputs": ["10", "20"],
+        }
+
+        adapter = APPSTestAdapter()
+        config = CodeExecConfig(stop_on_first_failure=False)
+
+        env = CodeExecEnv(
+            sample=sample,
+            sandbox_pool=sandbox_pool,
+            test_adapter=adapter,
+            config=config,
+        )
+
+        await env.env_reset()
+
+        # Submit wrong code (triples instead of doubles)
+        wrong_code = """
+n = int(input())
+print(n * 3)
+"""
+        outcome = await env.env_step(wrong_code)
+
+        assert outcome.terminated is True
+        assert outcome.reward == 0.0  # Binary reward, not all passed
+        assert outcome.info["all_passed"] is False
+        assert outcome.info["passed"] == 0
+
+    @pytest.mark.asyncio
+    async def test_env_partial_credit(self, sandbox_pool):
+        """Test env with partial credit enabled."""
+        from ludic.envs.code_exec.env import CodeExecEnv, CodeExecConfig
+        from ludic.envs.code_exec.adapters.apps import APPSTestAdapter
+
+        sample = {
+            "problem_id": "test_abs",
+            "question": "Write a program that reads an integer and prints its absolute value.",
+            "inputs": ["5", "-5", "0", "-10"],
+            "outputs": ["5", "5", "0", "10"],
+        }
+
+        adapter = APPSTestAdapter()
+        config = CodeExecConfig(
+            partial_credit=True,
+            stop_on_first_failure=False,
+        )
+
+        env = CodeExecEnv(
+            sample=sample,
+            sandbox_pool=sandbox_pool,
+            test_adapter=adapter,
+            config=config,
+        )
+
+        await env.env_reset()
+
+        # Code that only works for non-negative numbers
+        partial_code = """
+n = int(input())
+print(n)  # Wrong for negative numbers
+"""
+        outcome = await env.env_step(partial_code)
+
+        assert outcome.terminated is True
+        assert outcome.info["all_passed"] is False
+        assert outcome.info["passed"] == 2  # Only positive and zero pass
+        assert outcome.reward == pytest.approx(0.5)  # 2/4 = 0.5
diff --git a/tests/integration/test_grpo_e2e.py b/tests/integration/test_grpo_e2e.py
index efe9311..9f31280 100644
--- a/tests/integration/test_grpo_e2e.py
+++ b/tests/integration/test_grpo_e2e.py
@@ -22,7 +22,7 @@
     GroupNormalizedReturn,
 )
 from ludic.interaction.base import InteractionProtocol
-from ludic.interaction.single_agent import SingleAgentSyncProtocol
+from ludic.interaction.single_agent import SingleAgentProtocol
 
 from tests._mocks import SeedableMockAgent
 
@@ -51,7 +51,9 @@ def suggested_sysprompt(self) -> Optional[str]:
 
     def env_reset(self, *, seed: Optional[int] = None) -> Tuple[Observation, Info]:
         self._t = 0
-        self._obs = f"Start state for seed {seed}. Correct action is {self.correct_action}."
+        self._obs = (
+            f"Start state for seed {seed}. Correct action is {self.correct_action}."
+        )
         return self._obs, {"seed": seed}
 
     def env_step(self, action: str) -> StepOutcome:
@@ -110,7 +112,9 @@ def create_protocol() -> InteractionProtocol:
-        return SingleAgentSyncProtocol(agent=agent)
+        return SingleAgentProtocol(agent=agent)
 
     protocol_registry = {"grpo_protocol": create_protocol}
-    engine = RolloutEngine(protocol_registry=protocol_registry, env_registry=env_registry)
+    engine = RolloutEngine(
+        protocol_registry=protocol_registry, env_registry=env_registry
+    )
 
     def make_expanded_requests() -> List[RolloutRequest]:
         inference = InferenceSpec(
diff --git a/tests/test_batch_execution.py b/tests/test_batch_execution.py
new file mode 100644
index 0000000..6b70344
--- /dev/null
+++ b/tests/test_batch_execution.py
@@ -0,0 +1,501 @@
+"""
+Unit tests for batch execution functionality.
+
+Tests the batch execution path in StdinStdoutRunner using mock sandboxes
+that implement execute_batch().
+"""
+
+import pytest
+from typing import AsyncIterator, Union
+
+from ludic.envs.code_exec.runners import StdinStdoutRunner
+from ludic.envs.code_exec.types import (
+    BatchExecutionSpec,
+    TestCase,
+    CompileResult,
+    CompileStatus,
+    ExecutionResult,
+    RunStatus,
+)
+from ludic.envs.code_exec.adapters.base import ExactMatchVerifier
+
+
+# ---------------------------------------------------------------------
+# Mock Sandbox with execute_batch() support
+# ---------------------------------------------------------------------
+
+
+class MockBatchSandbox:
+    """
+    A mock sandbox that supports execute_batch() for testing the batched
+    execution path in StdinStdoutRunner.
+
+    Can be configured with:
+    - batch_results: List of results to yield from execute_batch()
+    - compile_success: Whether compilation succeeds
+    - break_after: If set, raise exception after yielding N results
+    """
+
+    def __init__(
+        self,
+        batch_results: list[Union[CompileResult, ExecutionResult, dict]] | None = None,
+        compile_success: bool = True,
+        break_after: int | None = None,
+    ):
+        self._batch_results = batch_results or []
+        self._compile_success = compile_success
+        self._break_after = break_after
+        self._python_version = "3.11"
+
+        # Track calls
+        self.execute_batch_calls: list[BatchExecutionSpec] = []
+
+    @property
+    def python_version(self) -> str:
+        return self._python_version
+
+    async def reset(self) -> None:
+        pass
+
+    async def compile(self, code: str, *, timeout_s: float = 5.0) -> CompileResult:
+        if self._compile_success:
+            return CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0)
+        return CompileResult(
+            status=CompileStatus.SYNTAX_ERROR,
+            error_message="SyntaxError",
+            duration_ms=5.0,
+        )
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        stdin: str = "",
+        skip_compile: bool = False,
+        timeout_s: float = 10.0,
+        memory_limit_mb: int | None = None,
+        env_vars: dict[str, str] | None = None,
+    ) -> ExecutionResult:
+        # Fallback for non-batch execution
+        return ExecutionResult(
+            compile_result=CompileResult(status=CompileStatus.SUCCESS),
+            run_status=RunStatus.SUCCESS,
+            stdout="",
+            stderr="",
+            exit_code=0,
+        )
+
+    async def execute_batch(
+        self,
+        spec: BatchExecutionSpec,
+    ) -> AsyncIterator[Union[CompileResult, ExecutionResult, dict]]:
+        """Yield pre-configured batch results."""
+        self.execute_batch_calls.append(spec)
+
+        count = 0
+        for result in self._batch_results:
+            if self._break_after is not None and count >= self._break_after:
+                raise RuntimeError("Simulated container crash")
+            yield result
+            count += 1
+        # Also crash once the configured results are exhausted, so that a
+        # break_after equal to len(batch_results) still breaks the stream.
+        if self._break_after is not None and count >= self._break_after:
+            raise RuntimeError("Simulated container crash")
+
+
+def make_success_execution(test_id: str, stdout: str) -> ExecutionResult:
+    """Helper to create a successful ExecutionResult for a test."""
+    return ExecutionResult(
+        compile_result=CompileResult(status=CompileStatus.SUCCESS),
+        run_status=RunStatus.SUCCESS,
+        stdout=stdout,
+        stderr="",
+        exit_code=0,
+        cache_key=test_id,  # Used to identify which test this result is for
+    )
+
+
+def make_failure_execution(
+    test_id: str, status: RunStatus = RunStatus.RUNTIME_ERROR
+) -> ExecutionResult:
+    """Helper to create a failed ExecutionResult for a test."""
+    return ExecutionResult(
+        compile_result=CompileResult(status=CompileStatus.SUCCESS),
+        run_status=status,
+        stdout="",
+        stderr="Error occurred",
+        exit_code=1,
+        cache_key=test_id,
+    )
+
+
+# ---------------------------------------------------------------------
+# Batch Execution Tests
+# ---------------------------------------------------------------------
+
+
+class TestBatchExecution:
+    @pytest.mark.asyncio
+    async def test_batch_all_tests_pass(self):
+        """All tests pass through batch execution."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_success_execution("t1", "expected1"),
+            make_success_execution("t2", "expected2"),
+            {"type": "done", "passed": 2, "failed": 0},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [
+            TestCase(input="input1", expected="expected1", id="t1"),
+            TestCase(input="input2", expected="expected2", id="t2"),
+        ]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="print('hello')",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        assert result.all_passed is True
+        assert result.passed_count == 2
+        assert result.total_count == 2
+        assert len(sandbox.execute_batch_calls) == 1
+
+    @pytest.mark.asyncio
+    async def test_batch_compile_failure(self):
+        """Compilation failure returns all tests as failed."""
+        batch_results = [
+            CompileResult(
+                status=CompileStatus.SYNTAX_ERROR,
+                error_message="SyntaxError: invalid syntax",
+                error_line=1,
+                duration_ms=5.0,
+            ),
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [
+            TestCase(input="input1", expected="x", id="t1"),
+            TestCase(input="input2", expected="y", id="t2"),
+        ]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="invalid syntax",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        assert result.compile_failed is True
+        assert result.all_passed is False
+        assert result.passed_count == 0
+        assert len(result.results) == 2
+
+    @pytest.mark.asyncio
+    async def test_batch_some_tests_fail(self):
+        """Mixed pass/fail through batch execution."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_success_execution("t1", "correct"),
+            make_success_execution("t2", "wrong"),  # Output doesn't match expected
+            {"type": "done", "passed": 1, "failed": 1},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [
+            TestCase(input="input1", expected="correct", id="t1"),
+            TestCase(input="input2", expected="correct", id="t2"),  # Will fail
+        ]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="code",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        assert result.all_passed is False
+        assert result.passed_count == 1
+        assert result.total_count == 2
+        assert result.results[0].passed is True
+        assert result.results[1].passed is False
+
+    @pytest.mark.asyncio
+    async def test_batch_runtime_error(self):
+        """Runtime error in batch execution."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_failure_execution("t1", RunStatus.RUNTIME_ERROR),
+            {"type": "done", "passed": 0, "failed": 1},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [TestCase(input="input1", expected="output", id="t1")]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="raise Exception()",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        assert result.passed_count == 0
+        assert result.results[0].passed is False
+        assert "Runtime error" in (result.results[0].comparison_details or "")
+
+    @pytest.mark.asyncio
+    async def test_batch_timeout(self):
+        """Timeout in batch execution."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_failure_execution("t1", RunStatus.TIMEOUT),
+            {"type": "done", "passed": 0, "failed": 1},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [TestCase(input="input1", expected="output", id="t1")]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="while True: pass",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        assert result.passed_count == 0
+        assert result.results[0].passed is False
+
+    @pytest.mark.asyncio
+    async def test_batch_stop_on_first_failure_spec(self):
+        """Verify stop_on_first_failure is passed to BatchExecutionSpec."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_success_execution("t1", "output"),
+            {"type": "done", "passed": 1, "failed": 0},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [TestCase(input="input1", expected="output", id="t1")]
+
+        await runner.run_tests(
+            sandbox=sandbox,
+            code="code",
+            tests=tests,
+            verifier=verifier,
+            stop_on_first_failure=True,
+        )
+
+        assert len(sandbox.execute_batch_calls) == 1
+        spec = sandbox.execute_batch_calls[0]
+        assert spec.stop_on_first_failure is True
+
+    @pytest.mark.asyncio
+    async def test_batch_broken_stream_sandbox_error(self):
+        """Broken stream marks missing tests as SANDBOX_ERROR."""
+        # Stream breaks after the compile result and t1's result
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_success_execution("t1", "output1"),
+            # Stream breaks here - t2 and t3 never received
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results, break_after=2)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [
+            TestCase(input="input1", expected="output1", id="t1"),
+            TestCase(input="input2", expected="output2", id="t2"),
+            TestCase(input="input3", expected="output3", id="t3"),
+        ]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="code",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        # t1 should have succeeded, t2 and t3 should be SANDBOX_ERROR
+        assert len(result.results) == 3
+        assert result.results[0].passed is True
+        assert result.results[0].test_case.id == "t1"
+
+        # Find t2 and t3 results (order may vary due to dict iteration)
+        t2_result = next(r for r in result.results if r.test_case.id == "t2")
+        t3_result = next(r for r in result.results if r.test_case.id == "t3")
+
+        assert t2_result.passed is False
+        assert t2_result.execution.run_status == RunStatus.SANDBOX_ERROR
+        assert "Sandbox crashed" in (t2_result.comparison_details or "")
+
+        assert t3_result.passed is False
+        assert t3_result.execution.run_status == RunStatus.SANDBOX_ERROR
+
+    @pytest.mark.asyncio
+    async def test_batch_no_done_marker_adds_missing(self):
+        """Missing 'done' marker triggers fallback for unreceived tests."""
+        # No "done" marker, but some tests received
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_success_execution("t1", "output1"),
+            # No "done" marker - stream ended unexpectedly
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [
+            TestCase(input="input1", expected="output1", id="t1"),
+            TestCase(input="input2", expected="output2", id="t2"),
+        ]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="code",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        assert len(result.results) == 2
+        assert result.results[0].passed is True
+
+        # t2 should be marked as SANDBOX_ERROR
+        t2_result = next(r for r in result.results if r.test_case.id == "t2")
+        assert t2_result.execution.run_status == RunStatus.SANDBOX_ERROR
+
+    @pytest.mark.asyncio
+    async def test_batch_disabled_falls_back_to_individual(self):
+        """With use_batch_execution=False, individual execution is used."""
+        sandbox = MockBatchSandbox()
+        runner = StdinStdoutRunner(use_batch_execution=False)
+        verifier = ExactMatchVerifier()
+
+        tests = [TestCase(input="input1", expected="", id="t1")]
+
+        await runner.run_tests(
+            sandbox=sandbox,
+            code="code",
+            tests=tests,
+            verifier=verifier,
+        )
+
+        # execute_batch should NOT be called
+        assert len(sandbox.execute_batch_calls) == 0
+
+    @pytest.mark.asyncio
+    async def test_batch_spec_contains_all_test_info(self):
+        """Verify BatchExecutionSpec contains all test information."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS),
+            make_success_execution("t1", "out"),
+            {"type": "done"},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True, default_timeout_s=7.5)
+        verifier = ExactMatchVerifier()
+
+        tests = [TestCase(input="my_input", expected="out", id="t1")]
+
+        await runner.run_tests(
+            sandbox=sandbox,
+            code="my_code",
+            tests=tests,
+            verifier=verifier,
+            compile_first=True,
+            stop_on_first_failure=False,
+        )
+
+        assert len(sandbox.execute_batch_calls) == 1
+        spec = sandbox.execute_batch_calls[0]
+
+        assert spec.code == "my_code"
+        assert len(spec.tests) == 1
+        assert spec.tests[0].id == "t1"
+        assert spec.tests[0].input == "my_input"
+        assert spec.compile_first is True
+        assert spec.stop_on_first_failure is False
+        assert spec.timeout_s == 7.5
+
+    @pytest.mark.asyncio
+    async def test_batch_hashes_computed(self):
+        """Verify code_hash and tests_hash are computed for batch execution."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS),
+            make_success_execution("t1", "output"),
+            {"type": "done"},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        code = "print('hello')"
+        tests = [TestCase(input="input1", expected="output", id="t1")]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code=code,
+            tests=tests,
+            verifier=verifier,
+        )
+
+        # Verify hashes are present
+        assert len(result.code_hash) == 16
+        assert len(result.tests_hash) == 16
+        assert all(c in "0123456789abcdef" for c in result.code_hash)
+
+
+class TestBatchExecutionNotRunStatus:
+    """Tests for NOT_RUN status handling in batch execution."""
+
+    @pytest.mark.asyncio
+    async def test_not_run_tests_from_batch_stream(self):
+        """Tests marked as NOT_RUN in batch stream are handled correctly."""
+        batch_results = [
+            CompileResult(status=CompileStatus.SUCCESS, duration_ms=10.0),
+            make_failure_execution("t1", RunStatus.RUNTIME_ERROR),
+            # t2 marked as not_run by batch_runner due to stop_on_first_failure
+            ExecutionResult(
+                compile_result=CompileResult(status=CompileStatus.SUCCESS),
+                run_status=RunStatus.NOT_RUN,
+                stdout="",
+                stderr="",
+                exit_code=None,
+                cache_key="t2",
+            ),
+            {"type": "done", "passed": 0, "failed": 1},
+        ]
+        sandbox = MockBatchSandbox(batch_results=batch_results)
+        runner = StdinStdoutRunner(use_batch_execution=True)
+        verifier = ExactMatchVerifier()
+
+        tests = [
+            TestCase(input="input1", expected="output1", id="t1"),
+            TestCase(input="input2", expected="output2", id="t2"),
+        ]
+
+        result = await runner.run_tests(
+            sandbox=sandbox,
+            code="code",
+            tests=tests,
+            verifier=verifier,
+            stop_on_first_failure=True,
+        )
+
+        assert len(result.results) == 2
+        assert result.results[0].passed is False
+        assert result.results[0].execution.run_status == RunStatus.RUNTIME_ERROR
+
+        t2_result = next(r for r in result.results if r.test_case.id == "t2")
+        assert t2_result.passed is False
+        assert t2_result.execution.run_status == RunStatus.NOT_RUN
diff --git a/tests/test_code_exec_adapters.py b/tests/test_code_exec_adapters.py
new file mode 100644
index 0000000..2d701bd
--- /dev/null
+++ b/tests/test_code_exec_adapters.py
@@ -0,0 +1,335 @@
+"""
+Unit tests for ludic.envs.code_exec.adapters
+
+Tests verifiers and test adapters.
+"""
+
+import pytest
+
+from ludic.envs.code_exec.adapters.base import (
+    ExactMatchVerifier,
+    WhitespaceNormalizedVerifier,
+    FloatTolerantVerifier,
+)
+from ludic.envs.code_exec.adapters.apps import (
+    APPSTestAdapter,
+    APPS_SYSTEM_PROMPT,
+)
+from ludic.envs.code_exec.types import TestCase
+
+
+# ---------------------------------------------------------------------
+# ExactMatchVerifier Tests
+# ---------------------------------------------------------------------
+
+
+class TestExactMatchVerifier:
+    def test_exact_match_passes(self):
+        verifier = ExactMatchVerifier()
+        passed, details = verifier.verify("hello", "hello")
+        assert passed is True
+        assert details is None
+
+    def test_mismatch_fails(self):
+        verifier = ExactMatchVerifier()
+        passed, details = verifier.verify("hello", "world")
+        assert passed is False
+        assert details is not None
+
+    def test_strips_whitespace_by_default(self):
+        verifier = ExactMatchVerifier()
+        passed, _ = verifier.verify(" hello \n", "hello")
+        assert passed is True
+
+    def test_strip_disabled(self):
+        verifier = ExactMatchVerifier(strip=False)
+        passed, _ = verifier.verify("hello ", "hello")
+        assert passed is False
+
+    def test_case_sensitive_by_default(self):
+        verifier = ExactMatchVerifier()
+        passed, _ = verifier.verify("Hello", "hello")
+        assert passed is False
+
+    def test_case_insensitive(self):
+        verifier = ExactMatchVerifier(case_sensitive=False)
+        passed, _ = verifier.verify("HELLO", "hello")
+        assert passed is True
+
+    def test_length_mismatch_details(self):
+        verifier = ExactMatchVerifier()
+        passed, details = verifier.verify("abc", "abcdef")
+        assert passed is False
+        assert "Length mismatch" in details
+        assert "3" in details
+        assert "6" in details
+
+    def test_first_diff_details(self):
+        verifier = ExactMatchVerifier()
+        passed, details = verifier.verify("abc", "axc")
+        assert passed is False
+        assert "First diff" in details
+
+
+# ---------------------------------------------------------------------
+# WhitespaceNormalizedVerifier Tests
+# ---------------------------------------------------------------------
+
+
+class TestWhitespaceNormalizedVerifier:
+    def test_normalizes_multiple_spaces(self):
+        verifier = WhitespaceNormalizedVerifier()
+        passed, _ = verifier.verify("hello    world", "hello world")
+        assert passed is True
+
+    def test_normalizes_newlines(self):
+        verifier = WhitespaceNormalizedVerifier()
+        passed, _ = verifier.verify("hello\n\nworld", "hello world")
+        assert passed is True
+
+    def test_normalizes_tabs(self):
+        verifier = WhitespaceNormalizedVerifier()
+        passed, _ = verifier.verify("hello\t\tworld", "hello world")
+        assert passed is True
+
+    def test_normalizes_mixed_whitespace(self):
+        verifier = WhitespaceNormalizedVerifier()
+        passed, _ = verifier.verify(" hello \n\t world ", "hello world")
+        assert passed is True
+
+    def test_content_mismatch_fails(self):
+        verifier = WhitespaceNormalizedVerifier()
+        passed, _ = verifier.verify("hello world", "hello mars")
+        assert passed is False
+
+
+# ---------------------------------------------------------------------
+# FloatTolerantVerifier Tests
+# ---------------------------------------------------------------------
+
+
+class TestFloatTolerantVerifier:
+    def test_exact_float_match(self):
+        verifier = FloatTolerantVerifier()
+        passed, _ = verifier.verify("3.14159", "3.14159")
+        assert passed is True
+
+    def test_float_within_tolerance(self):
+        verifier = FloatTolerantVerifier(abs_tol=1e-6)
+        passed, _ = verifier.verify("3.141590001", "3.14159")
+        assert passed is True
+
+    def test_float_outside_tolerance(self):
+        verifier = FloatTolerantVerifier(abs_tol=1e-9)
+        passed, _ = verifier.verify("3.15", "3.14")
+        assert passed is False
+
+    def test_integer_match(self):
+        verifier = FloatTolerantVerifier()
+        passed, _ = verifier.verify("42", "42")
+        assert passed is True
+
+    def test_string_exact_match(self):
+        verifier = FloatTolerantVerifier()
+        passed, _ = verifier.verify("hello", "hello")
+        assert passed is True
+
+    def test_string_mismatch(self):
+        verifier = FloatTolerantVerifier()
+        passed, _ = verifier.verify("hello", "world")
+        assert passed is False
+
+    def test_multiple_tokens(self):
+        verifier = FloatTolerantVerifier(abs_tol=1e-6)
+        passed, _ = verifier.verify("1.0 2.0 3.0", "1.0 2.0 3.0")
+        assert passed is True
+
+    def test_multiple_tokens_within_tolerance(self):
+        verifier = FloatTolerantVerifier(abs_tol=0.01)
+        passed, _ = verifier.verify("1.001 2.002 3.003", "1.0 2.0 3.0")
+        assert passed is True
+
+    def test_token_count_mismatch(self):
+        verifier = FloatTolerantVerifier()
+        passed, details = verifier.verify("1 2", "1 2 3")
+        assert passed is False
+        assert "Token count mismatch" in details
+
+    def test_relative_tolerance(self):
+        verifier = FloatTolerantVerifier(rel_tol=0.01, abs_tol=0)
+        # 1% of 100 = 1, so 100.5 should match 100
+        passed, _ = verifier.verify("100.5", "100")
+        assert passed is True
+
+    def test_strips_whitespace(self):
+        verifier = FloatTolerantVerifier()
+        passed, _ = verifier.verify(" 42 ", "42")
+        assert passed is True
+
+
+# ---------------------------------------------------------------------
+# APPSTestAdapter Tests
+# ---------------------------------------------------------------------
+
+
+class TestAPPSTestAdapter:
+    def test_get_prompt_extracts_question(self):
+        adapter = APPSTestAdapter()
+        sample = {
+            "question": "Write a function to add two numbers.",
+            "inputs": ["1 2"],
+            "outputs": ["3"],
+        }
+        prompt = adapter.get_prompt(sample)
+        assert prompt == "Write a function to add two numbers."
+
+    def test_get_prompt_with_custom_key(self):
+        adapter = APPSTestAdapter(question_key="problem_description")
+        sample = {
+            "problem_description": "Custom problem text",
+            "inputs": [],
+            "outputs": [],
+        }
+        prompt = adapter.get_prompt(sample)
+        assert prompt == "Custom problem text"
+
+    def test_get_problem_id(self):
+        adapter = APPSTestAdapter()
+        sample = {
+            "problem_id": "prob_123",
+            "question": "Q",
+            "inputs": [],
+            "outputs": [],
+        }
+        assert adapter.get_problem_id(sample) == "prob_123"
+
+    def test_get_problem_id_missing_returns_unknown(self):
+        adapter = APPSTestAdapter()
+        sample = {
+            "question": "Q",
+            "inputs": [],
+            "outputs": [],
+        }
+        assert adapter.get_problem_id(sample) == "unknown"
+
+    def test_get_problem_id_custom_key(self):
+        adapter = APPSTestAdapter(problem_id_key="id")
+        sample = {
+            "id": "custom_id",
+            "question": "Q",
+            "inputs": [],
+            "outputs": [],
+        }
+        assert adapter.get_problem_id(sample) == "custom_id"
+
+    def test_get_tests_single_test(self):
+        adapter = APPSTestAdapter()
+        sample = {
+            "question": "Q",
+            "inputs": ["1 2"],
+            "outputs": ["3"],
+        }
+        tests = adapter.get_tests(sample)
+        assert len(tests) == 1
+        assert tests[0].input == "1 2"
+        assert tests[0].expected == "3"
+        assert tests[0].id == "test_0"
+
+    def test_get_tests_multiple_tests(self):
+        adapter = APPSTestAdapter()
+        sample = {
+            "question": "Q",
+            "inputs": ["1", "2", "3"],
+            "outputs": ["a", "b", "c"],
+        }
+        tests = adapter.get_tests(sample)
+        assert len(tests) == 3
+        assert tests[0].input == "1"
+        assert tests[0].expected == "a"
+        assert tests[0].id == "test_0"
+        assert tests[1].input == "2"
+        assert tests[1].expected == "b"
+        assert tests[1].id == "test_1"
+        assert tests[2].input == "3"
+        assert tests[2].expected == "c"
+        assert tests[2].id == "test_2"
+
+    def test_get_tests_mismatched_length_raises(self):
+        adapter = APPSTestAdapter()
+        sample = {
+            "question": "Q",
+            "inputs": ["1", "2", "3"],
+            "outputs": ["a", "b"],  # One less
+        }
+        with pytest.raises(ValueError) as exc_info:
+            adapter.get_tests(sample)
+        assert "Mismatched" in str(exc_info.value)
+
+    def test_get_tests_custom_keys(self):
+        adapter = APPSTestAdapter(inputs_key="test_inputs", outputs_key="test_outputs")
+        sample = {
+            "question": "Q",
+            "test_inputs": ["x"],
+            "test_outputs": ["y"],
+        }
+        tests = adapter.get_tests(sample)
+        assert len(tests) == 1
+        assert tests[0].input == "x"
+        assert tests[0].expected == "y"
+
+    def test_hash_tests_deterministic(self):
+        adapter = APPSTestAdapter()
+        tests = [
+            TestCase(input="1", expected="a", id="t1"),
+            TestCase(input="2", expected="b", id="t2"),
+        ]
+        hash1 = adapter.hash_tests(tests)
+        hash2 = adapter.hash_tests(tests)
+        assert hash1 == hash2
+        assert len(hash1) == 16  # 16 hex chars
+
+    def test_hash_tests_different_for_different_tests(self):
+        adapter = APPSTestAdapter()
+        tests1 = [TestCase(input="1", expected="a", id="t1")]
+        tests2 = [TestCase(input="2", expected="b", id="t1")]
+        hash1 = adapter.hash_tests(tests1)
+        hash2 = adapter.hash_tests(tests2)
+        assert hash1 != hash2
+
+    def test_hash_tests_order_matters(self):
+        adapter = APPSTestAdapter()
+        tests1 = [
+            TestCase(input="1", expected="a", id="t1"),
+            TestCase(input="2", expected="b", id="t2"),
+        ]
+        tests2 = [
+            TestCase(input="2", expected="b", id="t2"),
+            TestCase(input="1", expected="a", id="t1"),
+        ]
+        hash1 = adapter.hash_tests(tests1)
+        hash2 = adapter.hash_tests(tests2)
+        assert hash1 != hash2
+
+    def test_hash_tests_ignores_id(self):
+        """Hash should be based on input/expected, not id."""
+        adapter = APPSTestAdapter()
+        tests1 = [TestCase(input="1", expected="a", id="test_0")]
+        tests2 = [TestCase(input="1", expected="a", id="different_id")]
+        hash1 = adapter.hash_tests(tests1)
+        hash2 = adapter.hash_tests(tests2)
+        assert hash1 == hash2
+
+
+class TestAPPSSystemPrompt:
+    def test_system_prompt_exists(self):
+        assert APPS_SYSTEM_PROMPT is not None
+        assert len(APPS_SYSTEM_PROMPT) > 0
+
+    def test_system_prompt_mentions_python(self):
+        assert "Python" in APPS_SYSTEM_PROMPT or "python" in APPS_SYSTEM_PROMPT
+
+    def test_system_prompt_mentions_stdin(self):
+        assert "stdin" in APPS_SYSTEM_PROMPT
+
+    def test_system_prompt_mentions_stdout(self):
+        assert "stdout" in APPS_SYSTEM_PROMPT
diff --git a/tests/test_code_exec_async_protocol.py b/tests/test_code_exec_async_protocol.py
new file mode 100644
index 0000000..c45142a
--- /dev/null
+++ b/tests/test_code_exec_async_protocol.py
@@ -0,0 +1,465 @@
+"""
+Integration tests for async env support in SingleAgentProtocol.
+
+Tests that the protocol correctly detects and handles envs with async
+env_reset and env_step methods (like CodeExecEnv).
+"""
+
+from typing import Optional, Tuple
+
+import pytest
+
+from ludic.context.full_dialog import FullDialog
+from ludic.interaction.single_agent import SingleAgentProtocol, _has_async_env_methods
+from ludic.agents.base_agent import Agent
+from ludic.envs.single_agent_env import SingleAgentEnv
+from ludic.parsers import ParseResult
+from ludic.types import Info, Observation, StepOutcome
+from tests._mocks import MockClient
+
+
+# Simple pass-through parser for tests
+def _passthrough_parser(raw: str) -> ParseResult:
+    return ParseResult(action=raw, reward=0.0, obs=None)
+
+
+# ---------------------------------------------------------------------
+# Mock Async Env for Testing
+# ---------------------------------------------------------------------
+
+
+class MockAsyncEnv(SingleAgentEnv):
+    """
+    A mock async env that simulates CodeExecEnv behavior.
+
+    Has async env_reset and env_step methods, unlike standard sync envs.
+    """
+
+    def __init__(
+        self,
+        target_action: str = "correct_code",
+        max_steps: int = 3,
+    ):
+        super().__init__()
+        self._target_action = target_action
+        self._max_steps = max_steps
+        self._step_count = 0
+        self._obs = "Write code to solve the problem."
+ + # Track calls for assertions + self.reset_calls = 0 + self.step_calls = 0 + + @property + def suggested_sysprompt(self) -> Optional[str]: + return "You are a code assistant." + + async def env_reset(self, *, seed: Optional[int] = None) -> Tuple[Observation, Info]: + """Async reset method (like CodeExecEnv).""" + self.reset_calls += 1 + self._step_count = 0 + self._obs = "Write code to solve the problem." + return self._obs, {"problem_id": "test_problem", "async_env": True} + + async def env_step(self, action: str) -> StepOutcome: + """Async step method (like CodeExecEnv).""" + self.step_calls += 1 + self._step_count += 1 + + if action == self._target_action: + # Correct code - terminate with success + return StepOutcome( + obs="All tests passed!", + reward=1.0, + truncated=False, + terminated=True, + info={"all_passed": True, "step_count": self._step_count}, + ) + elif self._step_count >= self._max_steps: + # Max steps reached - truncate + return StepOutcome( + obs=f"Tests failed. Attempt {self._step_count}/{self._max_steps}.", + reward=-0.1, + truncated=True, + terminated=False, + info={"all_passed": False, "step_count": self._step_count}, + ) + else: + # Wrong code but more attempts allowed + return StepOutcome( + obs=f"Tests failed. Try again. Attempt {self._step_count}/{self._max_steps}.", + reward=-0.1, + truncated=False, + terminated=False, + info={"all_passed": False, "step_count": self._step_count}, + ) + + def env_current_obs(self) -> Observation: + return self._obs + + +class MockSyncEnv(SingleAgentEnv): + """ + A standard sync env for comparison testing. + Uses regular (non-async) env_reset and env_step. 
+ """ + + def __init__(self, target_action: str = "correct"): + super().__init__() + self._target_action = target_action + self._obs = "Sync env observation" + self.reset_calls = 0 + self.step_calls = 0 + + def env_reset(self, *, seed: Optional[int] = None) -> Tuple[Observation, Info]: + """Standard sync reset.""" + self.reset_calls += 1 + self._obs = "Sync env observation" + return self._obs, {"sync_env": True} + + def env_step(self, action: str) -> StepOutcome: + """Standard sync step.""" + self.step_calls += 1 + terminated = action == self._target_action + return StepOutcome( + obs="Success" if terminated else "Wrong", + reward=1.0 if terminated else -0.1, + truncated=False, + terminated=terminated, + info={}, + ) + + def env_current_obs(self) -> Observation: + return self._obs + + +# --------------------------------------------------------------------- +# Async Detection Tests +# --------------------------------------------------------------------- + + +class TestAsyncDetection: + def test_detects_async_reset(self): + env = MockAsyncEnv() + has_async_reset, has_async_step = _has_async_env_methods(env) + assert has_async_reset is True + + def test_detects_async_step(self): + env = MockAsyncEnv() + has_async_reset, has_async_step = _has_async_env_methods(env) + assert has_async_step is True + + def test_detects_sync_env(self): + env = MockSyncEnv() + has_async_reset, has_async_step = _has_async_env_methods(env) + assert has_async_reset is False + assert has_async_step is False + + +# --------------------------------------------------------------------- +# Protocol Async Env Integration Tests +# --------------------------------------------------------------------- + + +class TestProtocolAsyncEnvIntegration: + @pytest.mark.asyncio + async def test_protocol_runs_async_env_successfully(self): + """Protocol should correctly run an async env and produce rollouts.""" + env = MockAsyncEnv(target_action="correct_code") + agent = Agent( + 
client=MockClient(text="correct_code"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=5) + + assert len(rollouts) == 1 + rollout = rollouts[0] + + # Should terminate on first step with correct action + assert len(rollout.steps) == 1 + assert rollout.steps[0].terminated is True + assert rollout.steps[0].reward == pytest.approx(1.0) + assert rollout.steps[0].info.get("all_passed") is True + + @pytest.mark.asyncio + async def test_protocol_calls_async_reset(self): + """Protocol should call async env_reset and receive correct observation.""" + env = MockAsyncEnv() + agent = Agent( + client=MockClient(text="wrong_code"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + await protocol.run(env=env, max_steps=1) + + # Verify async reset was called + assert env.reset_calls == 1 + + @pytest.mark.asyncio + async def test_protocol_calls_async_step(self): + """Protocol should call async env_step with the parsed action.""" + env = MockAsyncEnv() + agent = Agent( + client=MockClient(text="some_code"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + await protocol.run(env=env, max_steps=1) + + # Verify async step was called + assert env.step_calls == 1 + + @pytest.mark.asyncio + async def test_protocol_uses_async_env_system_prompt(self): + """Protocol should use the async env's suggested_sysprompt.""" + env = MockAsyncEnv() + agent = Agent( + client=MockClient(text="code"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + await protocol.run(env=env, max_steps=1) + + # The agent context should have the system prompt from env + messages = agent._ctx.messages + # First message should be system prompt + assert any( + m.get("role") == "system" and "code 
assistant" in m.get("content", "").lower() + for m in messages + ) + + @pytest.mark.asyncio + async def test_async_env_multiple_steps(self): + """Test that async env works correctly over multiple steps.""" + env = MockAsyncEnv(target_action="correct", max_steps=5) + + # Agent says "wrong" first 2 times, then "correct" + call_count = 0 + + class CountingClient(MockClient): + def __init__(self): + super().__init__(text="wrong") + + async def complete(self, *args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count < 3: + self._text = "wrong" + else: + self._text = "correct" + return await super().complete(*args, **kwargs) + + agent = Agent( + client=CountingClient(), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=10) + + assert len(rollouts) == 1 + rollout = rollouts[0] + + # Should have taken 3 steps to get correct answer + assert len(rollout.steps) == 3 + assert rollout.steps[0].terminated is False + assert rollout.steps[1].terminated is False + assert rollout.steps[2].terminated is True + + # Total reward: -0.1 + -0.1 + 1.0 = 0.8 + assert rollout.total_reward == pytest.approx(0.8) + + +# --------------------------------------------------------------------- +# Backward Compatibility Tests +# --------------------------------------------------------------------- + + +class TestBackwardCompatibility: + @pytest.mark.asyncio + async def test_sync_env_still_works(self): + """Sync envs should continue to work without changes.""" + env = MockSyncEnv(target_action="correct") + agent = Agent( + client=MockClient(text="correct"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=5) + + assert len(rollouts) == 1 + rollout = rollouts[0] + + assert len(rollout.steps) == 1 + assert rollout.steps[0].terminated is True + assert 
rollout.steps[0].reward == pytest.approx(1.0) + + @pytest.mark.asyncio + async def test_sync_env_reset_is_called(self): + """Sync env reset should be called through normal path.""" + env = MockSyncEnv() + agent = Agent( + client=MockClient(text="wrong"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + await protocol.run(env=env, max_steps=1) + + assert env.reset_calls == 1 + + @pytest.mark.asyncio + async def test_sync_env_step_is_called(self): + """Sync env step should be called through normal path.""" + env = MockSyncEnv() + agent = Agent( + client=MockClient(text="wrong"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + await protocol.run(env=env, max_steps=1) + + assert env.step_calls == 1 + + +# --------------------------------------------------------------------- +# Info Propagation Tests +# --------------------------------------------------------------------- + + +class TestAsyncEnvInfoPropagation: + @pytest.mark.asyncio + async def test_reset_info_accessible_in_rollout(self): + """Info from async env_reset should be accessible.""" + env = MockAsyncEnv() + agent = Agent( + client=MockClient(text="correct_code"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=1) + + # The first step's prev_obs should be from reset + assert rollouts[0].steps[0].prev_obs == "Write code to solve the problem." 
+ + @pytest.mark.asyncio + async def test_step_info_propagated_to_rollout(self): + """Info from async env_step should be in the step info.""" + env = MockAsyncEnv() + agent = Agent( + client=MockClient(text="correct_code"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=1) + + step_info = rollouts[0].steps[0].info + assert step_info.get("all_passed") is True + assert step_info.get("step_count") == 1 + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestAsyncEnvEdgeCases: + @pytest.mark.asyncio + async def test_async_env_truncation_on_max_steps(self): + """Async env that never terminates should truncate at max_steps.""" + env = MockAsyncEnv(target_action="impossible", max_steps=100) + agent = Agent( + client=MockClient(text="wrong"), + model="mock", + ctx=FullDialog(), + parser=_passthrough_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=3) + + rollout = rollouts[0] + assert len(rollout.steps) == 3 + assert rollout.steps[-1].truncated is True + assert rollout.meta.get("episode_truncated") is True + + @pytest.mark.asyncio + async def test_async_env_with_parser(self): + """Parser should work correctly with async envs.""" + from ludic.parsers import xml_tag_parser + + env = MockAsyncEnv(target_action="parsed_code") + agent = Agent( + client=MockClient(text="<code>parsed_code</code>"), + model="mock", + ctx=FullDialog(), + parser=xml_tag_parser("code"), + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=5) + + rollout = rollouts[0] + assert len(rollout.steps) == 1 + assert rollout.steps[0].terminated is True + assert rollout.steps[0].info.get("parsed_action") == "parsed_code" + + @pytest.mark.asyncio
+ async def test_async_env_parser_failure(self): + """Parser failures should be handled correctly with async envs.""" + from ludic.parsers import ParseResult + + def strict_parser(text: str) -> ParseResult: + if text.startswith("VALID:"): + return ParseResult(action=text[6:], reward=0.1, obs=None) + return ParseResult(action=None, reward=-0.5, obs="Invalid format") + + env = MockAsyncEnv() + agent = Agent( + client=MockClient(text="invalid_format"), + model="mock", + ctx=FullDialog(), + parser=strict_parser, + ) + protocol = SingleAgentProtocol(agent=agent) + + rollouts = await protocol.run(env=env, max_steps=1) + + rollout = rollouts[0] + assert len(rollout.steps) == 1 + step = rollout.steps[0] + + # Parser failure - no env step called + assert env.step_calls == 0 + assert step.info.get("parse_error") is True + assert step.reward == pytest.approx(-0.5) + assert step.next_obs == "Invalid format" diff --git a/tests/test_code_exec_cache.py b/tests/test_code_exec_cache.py new file mode 100644 index 0000000..e04d929 --- /dev/null +++ b/tests/test_code_exec_cache.py @@ -0,0 +1,392 @@ +""" +Unit tests for ludic.envs.code_exec.docker_sandbox.LRUCache + +Tests thread safety, eviction behavior, and statistics tracking. + +Note: Requires the `docker` package to be installed for LRUCache import.
+""" + +import threading +import time +from concurrent.futures import ThreadPoolExecutor + +import pytest + +from ludic.envs.code_exec.types import ( + BatchTestResult, + CompileResult, + CompileStatus, + ExecutionResult, + RunStatus, + TestCase, + TestResult, +) + +# Try to import LRUCache - skip all tests if docker package not installed +try: + from ludic.envs.code_exec.docker_sandbox import LRUCache +except ImportError: + LRUCache = None # type: ignore[misc, assignment] + +pytestmark = pytest.mark.skipif( + LRUCache is None, + reason="docker package not installed (required for LRUCache)", +) + + +def _make_batch_result(passed_count: int = 1, total_count: int = 1) -> BatchTestResult: + """Helper to create a BatchTestResult with minimal boilerplate.""" + results = [] + for i in range(total_count): + passed = i < passed_count + results.append( + TestResult( + test_case=TestCase(input=f"input_{i}", expected="out", id=f"t{i}"), + passed=passed, + actual="out" if passed else "wrong", + execution=ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS if passed else RunStatus.RUNTIME_ERROR, + ), + ) + ) + return BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + + +# --------------------------------------------------------------------- +# Basic Operations +# --------------------------------------------------------------------- + + +class TestLRUCacheBasicOperations: + def test_get_returns_none_for_missing_key(self): + cache = LRUCache(max_size=10) + result = cache.get("nonexistent_code", "nonexistent_tests") + assert result is None + + def test_put_and_get(self): + cache = LRUCache(max_size=10) + batch_result = _make_batch_result() + + cache.put("code1", "tests1", batch_result) + retrieved = cache.get("code1", "tests1") + + assert retrieved is batch_result + + def test_get_returns_none_after_different_key(self): + cache = LRUCache(max_size=10) + batch_result = _make_batch_result() + + 
cache.put("code1", "tests1", batch_result) + + # Different code hash + assert cache.get("code2", "tests1") is None + # Different tests hash + assert cache.get("code1", "tests2") is None + # Both different + assert cache.get("code2", "tests2") is None + + def test_put_overwrites_existing_entry(self): + cache = LRUCache(max_size=10) + result1 = _make_batch_result(passed_count=1, total_count=2) + result2 = _make_batch_result(passed_count=2, total_count=2) + + cache.put("code1", "tests1", result1) + cache.put("code1", "tests1", result2) + + retrieved = cache.get("code1", "tests1") + assert retrieved is result2 + assert retrieved.passed_count == 2 + + +# --------------------------------------------------------------------- +# Eviction Behavior +# --------------------------------------------------------------------- + + +class TestLRUCacheEviction: + def test_evicts_oldest_when_full(self): + cache = LRUCache(max_size=3) + + cache.put("code1", "tests", _make_batch_result()) + cache.put("code2", "tests", _make_batch_result()) + cache.put("code3", "tests", _make_batch_result()) + + # Cache is now full + assert cache.stats["size"] == 3 + + # Add one more - oldest (code1) should be evicted + cache.put("code4", "tests", _make_batch_result()) + + assert cache.stats["size"] == 3 + assert cache.get("code1", "tests") is None # Evicted + assert cache.get("code2", "tests") is not None + assert cache.get("code3", "tests") is not None + assert cache.get("code4", "tests") is not None + + def test_access_refreshes_entry_avoiding_eviction(self): + cache = LRUCache(max_size=3) + + cache.put("code1", "tests", _make_batch_result()) + cache.put("code2", "tests", _make_batch_result()) + cache.put("code3", "tests", _make_batch_result()) + + # Access code1 to make it most recently used + cache.get("code1", "tests") + + # Add new entry - code2 (now oldest accessed) should be evicted + cache.put("code4", "tests", _make_batch_result()) + + assert cache.get("code1", "tests") is not None # Still 
present + assert cache.get("code2", "tests") is None # Evicted + assert cache.get("code3", "tests") is not None + assert cache.get("code4", "tests") is not None + + def test_put_refreshes_existing_entry(self): + cache = LRUCache(max_size=3) + + cache.put("code1", "tests", _make_batch_result()) + cache.put("code2", "tests", _make_batch_result()) + cache.put("code3", "tests", _make_batch_result()) + + # Update code1 (makes it most recently used) + cache.put("code1", "tests", _make_batch_result()) + + # Add new entry - code2 should be evicted now + cache.put("code4", "tests", _make_batch_result()) + + assert cache.get("code1", "tests") is not None + assert cache.get("code2", "tests") is None # Evicted + assert cache.get("code3", "tests") is not None + assert cache.get("code4", "tests") is not None + + def test_max_size_one(self): + cache = LRUCache(max_size=1) + + cache.put("code1", "tests", _make_batch_result()) + assert cache.get("code1", "tests") is not None + + cache.put("code2", "tests", _make_batch_result()) + assert cache.get("code1", "tests") is None + assert cache.get("code2", "tests") is not None + + +# --------------------------------------------------------------------- +# Statistics Tracking +# --------------------------------------------------------------------- + + +class TestLRUCacheStats: + def test_initial_stats(self): + cache = LRUCache(max_size=100) + stats = cache.stats + + assert stats["hits"] == 0 + assert stats["misses"] == 0 + assert stats["size"] == 0 + assert stats["max_size"] == 100 + + def test_hit_tracking(self): + cache = LRUCache(max_size=10) + cache.put("code", "tests", _make_batch_result()) + + # First hit + cache.get("code", "tests") + assert cache.stats["hits"] == 1 + assert cache.stats["misses"] == 0 + + # Second hit + cache.get("code", "tests") + assert cache.stats["hits"] == 2 + assert cache.stats["misses"] == 0 + + def test_miss_tracking(self): + cache = LRUCache(max_size=10) + + # First miss + cache.get("nonexistent", "tests") 
+ assert cache.stats["hits"] == 0 + assert cache.stats["misses"] == 1 + + # Second miss + cache.get("also_nonexistent", "tests") + assert cache.stats["hits"] == 0 + assert cache.stats["misses"] == 2 + + def test_mixed_hits_and_misses(self): + cache = LRUCache(max_size=10) + cache.put("code1", "tests", _make_batch_result()) + + cache.get("code1", "tests") # hit + cache.get("code2", "tests") # miss + cache.get("code1", "tests") # hit + cache.get("code3", "tests") # miss + cache.get("code1", "tests") # hit + + stats = cache.stats + assert stats["hits"] == 3 + assert stats["misses"] == 2 + + def test_size_tracking(self): + cache = LRUCache(max_size=10) + + assert cache.stats["size"] == 0 + + cache.put("code1", "tests", _make_batch_result()) + assert cache.stats["size"] == 1 + + cache.put("code2", "tests", _make_batch_result()) + assert cache.stats["size"] == 2 + + # Overwrite existing doesn't increase size + cache.put("code1", "tests", _make_batch_result()) + assert cache.stats["size"] == 2 + + +# --------------------------------------------------------------------- +# Thread Safety +# --------------------------------------------------------------------- + + +class TestLRUCacheThreadSafety: + def test_concurrent_puts(self): + cache = LRUCache(max_size=1000) + n_threads = 10 + puts_per_thread = 100 + + def put_items(thread_id: int): + for i in range(puts_per_thread): + cache.put(f"code_{thread_id}_{i}", "tests", _make_batch_result()) + + with ThreadPoolExecutor(max_workers=n_threads) as executor: + futures = [executor.submit(put_items, i) for i in range(n_threads)] + for f in futures: + f.result() + + # All items should be accessible + expected_size = n_threads * puts_per_thread + assert cache.stats["size"] == expected_size + + def test_concurrent_gets(self): + cache = LRUCache(max_size=100) + + # Pre-populate + for i in range(100): + cache.put(f"code_{i}", "tests", _make_batch_result()) + + n_threads = 10 + gets_per_thread = 100 + + def get_items(thread_id: int): + 
hits = 0 + for i in range(gets_per_thread): + key = f"code_{i % 100}" # Round-robin through existing keys + if cache.get(key, "tests") is not None: + hits += 1 + return hits + + with ThreadPoolExecutor(max_workers=n_threads) as executor: + futures = [executor.submit(get_items, i) for i in range(n_threads)] + results = [f.result() for f in futures] + + # All gets should have found their items + assert all(r == gets_per_thread for r in results) + + # Stats should reflect all hits + assert cache.stats["hits"] == n_threads * gets_per_thread + + def test_concurrent_mixed_operations(self): + cache = LRUCache(max_size=50) + n_threads = 8 + ops_per_thread = 100 + + errors = [] + + def mixed_operations(thread_id: int): + try: + for i in range(ops_per_thread): + if i % 3 == 0: + cache.put(f"code_{i}", "tests", _make_batch_result()) + else: + cache.get(f"code_{i % 30}", "tests") + # Access stats during operations + _ = cache.stats + except Exception as e: + errors.append(str(e)) + + with ThreadPoolExecutor(max_workers=n_threads) as executor: + futures = [executor.submit(mixed_operations, i) for i in range(n_threads)] + for f in futures: + f.result() + + # No errors should have occurred + assert len(errors) == 0, f"Errors during concurrent operations: {errors}" + + # Cache should be in a consistent state + stats = cache.stats + assert stats["size"] <= stats["max_size"] + assert stats["hits"] >= 0 + assert stats["misses"] >= 0 + + def test_concurrent_eviction_stress(self): + """Test that concurrent puts with eviction don't cause issues.""" + cache = LRUCache(max_size=10) + n_threads = 20 + puts_per_thread = 100 + + errors = [] + + def stress_puts(thread_id: int): + try: + for i in range(puts_per_thread): + cache.put(f"code_{thread_id}_{i}", "tests", _make_batch_result()) + except Exception as e: + errors.append(str(e)) + + with ThreadPoolExecutor(max_workers=n_threads) as executor: + futures = [executor.submit(stress_puts, i) for i in range(n_threads)] + for f in futures: + 
f.result() + + assert len(errors) == 0 + assert cache.stats["size"] == 10 # Should stay at max + + +# --------------------------------------------------------------------- +# Edge Cases +# --------------------------------------------------------------------- + + +class TestLRUCacheEdgeCases: + def test_empty_hash_strings(self): + cache = LRUCache(max_size=10) + batch = _make_batch_result() + + cache.put("", "", batch) + assert cache.get("", "") is batch + + def test_very_long_hash_strings(self): + cache = LRUCache(max_size=10) + batch = _make_batch_result() + + long_code_hash = "a" * 10000 + long_tests_hash = "b" * 10000 + + cache.put(long_code_hash, long_tests_hash, batch) + assert cache.get(long_code_hash, long_tests_hash) is batch + + def test_special_characters_in_hashes(self): + cache = LRUCache(max_size=10) + batch = _make_batch_result() + + special_hash = "!@#$%^&*()_+-=[]{}|;':\",./<>?" + cache.put(special_hash, special_hash, batch) + assert cache.get(special_hash, special_hash) is batch + + def test_unicode_in_hashes(self): + cache = LRUCache(max_size=10) + batch = _make_batch_result() + + unicode_hash = "hash_with_unicode_" + cache.put(unicode_hash, unicode_hash, batch) + assert cache.get(unicode_hash, unicode_hash) is batch diff --git a/tests/test_code_exec_env.py b/tests/test_code_exec_env.py new file mode 100644 index 0000000..70d3cb1 --- /dev/null +++ b/tests/test_code_exec_env.py @@ -0,0 +1,880 @@ +""" +Unit tests for ludic.envs.code_exec.env.CodeExecEnv + +Tests the environment with mock sandbox pools to avoid Docker dependency. 
+""" + +import pytest + +from ludic.envs.code_exec.env import CodeExecConfig, CodeExecEnv +from ludic.envs.code_exec.types import ( + BatchTestResult, + CompileResult, + CompileStatus, + ExecutionResult, + RunStatus, + TestCase, + TestResult, +) +from ludic.envs.code_exec.adapters.base import ExactMatchVerifier, TestAdapter +from ludic.envs.code_exec.sandbox import Sandbox, SandboxPool + + +# --------------------------------------------------------------------- +# Mock Implementations +# --------------------------------------------------------------------- + + +class MockSandbox: + """Mock sandbox for testing without Docker.""" + + def __init__( + self, + compile_result: CompileResult | None = None, + execute_results: dict[str, ExecutionResult] | None = None, + default_stdout: str = "", + ): + self._compile_result = compile_result or CompileResult( + status=CompileStatus.SUCCESS, + duration_ms=10.0, + ) + self._execute_results = execute_results or {} + self._default_stdout = default_stdout + self._python_version = "3.11" + + # Track calls + self.reset_calls = 0 + self.compile_calls: list[str] = [] + self.execute_calls: list[tuple[str, str]] = [] + + @property + def python_version(self) -> str: + return self._python_version + + async def reset(self) -> None: + self.reset_calls += 1 + + async def compile(self, code: str, *, timeout_s: float = 5.0) -> CompileResult: + self.compile_calls.append(code) + return self._compile_result + + async def execute( + self, + code: str, + *, + stdin: str = "", + skip_compile: bool = False, + timeout_s: float = 10.0, + memory_limit_mb: int | None = None, + env_vars: dict[str, str] | None = None, + ) -> ExecutionResult: + self.execute_calls.append((code, stdin)) + + if stdin in self._execute_results: + return self._execute_results[stdin] + + return ExecutionResult( + compile_result=self._compile_result, + run_status=RunStatus.SUCCESS, + stdout=self._default_stdout, + stderr="", + exit_code=0, + compile_duration_ms=10.0, + 
run_duration_ms=50.0, + total_duration_ms=60.0, + ) + + +class MockSandboxPool: + """Mock sandbox pool for testing without Docker.""" + + def __init__( + self, + sandbox: MockSandbox | None = None, + python_version: str = "3.11", + ): + self._sandbox = sandbox or MockSandbox() + self._python_version = python_version + self._cache: dict[tuple[str, str], BatchTestResult] = {} + + # Track calls + self.start_calls = 0 + self.checkout_calls = 0 + self.release_calls = 0 + self.shutdown_calls = 0 + + @property + def python_version(self) -> str: + return self._python_version + + async def start(self) -> None: + self.start_calls += 1 + + async def checkout(self, timeout_s: float = 30.0) -> Sandbox: + self.checkout_calls += 1 + return self._sandbox + + async def release(self, sandbox: Sandbox) -> None: + self.release_calls += 1 + + async def shutdown(self) -> None: + self.shutdown_calls += 1 + + def get_cached(self, code_hash: str, tests_hash: str) -> BatchTestResult | None: + return self._cache.get((code_hash, tests_hash)) + + def put_cached( + self, code_hash: str, tests_hash: str, result: BatchTestResult + ) -> None: + self._cache[(code_hash, tests_hash)] = result + + @property + def cache_stats(self) -> dict[str, int]: + """Return mock cache statistics.""" + return { + "hits": 0, + "misses": 0, + "size": len(self._cache), + "max_size": 10000, + } + + +class MockTestAdapter: + """Mock test adapter for testing.""" + + def __init__( + self, + prompt: str = "Write a program.", + problem_id: str = "test_problem", + tests: list[TestCase] | None = None, + ): + self._prompt = prompt + self._problem_id = problem_id + self._tests = tests or [ + TestCase(input="1", expected="1", id="test_0"), + ] + + def get_prompt(self, sample: dict) -> str: + return self._prompt + + def get_problem_id(self, sample: dict) -> str: + return self._problem_id + + def get_tests(self, sample: dict) -> list[TestCase]: + return self._tests + + def hash_tests(self, tests: list[TestCase]) -> str: + return 
"mock_tests_hash_1234" + + +# --------------------------------------------------------------------- +# Environment Reset Tests +# --------------------------------------------------------------------- + + +class TestCodeExecEnvReset: + @pytest.mark.asyncio + async def test_reset_returns_prompt_and_info(self): + sandbox = MockSandbox(default_stdout="1") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter(prompt="Add two numbers.", problem_id="prob_1") + + env = CodeExecEnv( + sample={"question": "Add two numbers."}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + obs, info = await env.env_reset() + + assert obs == "Add two numbers." + assert info["problem_id"] == "prob_1" + assert "num_tests" in info + assert "tests_hash" in info + assert "python_version" in info + + @pytest.mark.asyncio + async def test_reset_extracts_correct_number_of_tests(self): + sandbox = MockSandbox(default_stdout="out") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[ + TestCase(input="1", expected="a", id="t0"), + TestCase(input="2", expected="b", id="t1"), + TestCase(input="3", expected="c", id="t2"), + ] + ) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + obs, info = await env.env_reset() + + assert info["num_tests"] == 3 + + @pytest.mark.asyncio + async def test_reset_respects_max_tests_config(self): + sandbox = MockSandbox(default_stdout="out") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[ + TestCase(input="1", expected="a", id="t0"), + TestCase(input="2", expected="b", id="t1"), + TestCase(input="3", expected="c", id="t2"), + TestCase(input="4", expected="d", id="t3"), + TestCase(input="5", expected="e", id="t4"), + ] + ) + + config = CodeExecConfig(max_tests=2) + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + obs, info = await env.env_reset() + + assert info["num_tests"] == 2 + + @pytest.mark.asyncio + 
async def test_reset_handles_empty_tests(self): + sandbox = MockSandbox() + pool = MockSandboxPool(sandbox=sandbox) + + # Create adapter that returns empty tests + class EmptyTestsAdapter: + def get_prompt(self, sample: dict) -> str: + return "Write a program." + + def get_problem_id(self, sample: dict) -> str: + return "test_problem" + + def get_tests(self, sample: dict) -> list[TestCase]: + return [] # No tests! + + def hash_tests(self, tests: list[TestCase]) -> str: + return "empty_hash" + + adapter = EmptyTestsAdapter() + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + obs, info = await env.env_reset() + + assert "error" in info + assert info["error"] == "no_tests_extracted" + + @pytest.mark.asyncio + async def test_reset_sets_system_prompt(self): + sandbox = MockSandbox() + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + system_prompt="You are a Python expert.", + ) + + assert env.suggested_sysprompt == "You are a Python expert." 
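The step tests in the following sections pin down a simple reward contract: every test passing yields reward 1.0 and termination, a partial pass under binary (non-partial-credit) scoring yields 0.0, and a compile failure or empty submission yields the configured `compile_failure_reward`. A minimal self-contained sketch of that rule — the `binary_reward` helper and its defaults are illustrative, not the actual `CodeExecEnv` implementation:

```python
def binary_reward(
    passed: int,
    total: int,
    *,
    compile_failed: bool = False,
    compile_failure_reward: float = -1.0,
) -> float:
    """Illustrative reward rule matching what the step tests assert."""
    if compile_failed:
        # Compile failures (and empty code) earn the configured penalty.
        return compile_failure_reward
    # Binary scoring: full credit only when every test passes.
    return 1.0 if total > 0 and passed == total else 0.0
```

For example, `CodeExecConfig(compile_failure_reward=-0.2)` in the compile-failure test below corresponds to the `compile_failed` branch, and the 1-of-2 partial pass corresponds to the 0.0 case.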
+ + +# --------------------------------------------------------------------- +# Environment Step Tests - Success Cases +# --------------------------------------------------------------------- + + +class TestCodeExecEnvStepSuccess: + @pytest.mark.asyncio + async def test_step_all_tests_pass(self): + sandbox = MockSandbox(default_stdout="expected_output") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[ + TestCase(input="in1", expected="expected_output", id="t0"), + TestCase(input="in2", expected="expected_output", id="t1"), + ] + ) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + await env.env_reset() + outcome = await env.env_step("print('expected_output')") + + assert outcome.terminated is True + assert outcome.truncated is False + assert outcome.reward == 1.0 + assert outcome.info["all_passed"] is True + assert outcome.info["passed"] == 2 + assert outcome.info["total"] == 2 + assert "All" in outcome.obs and "passed" in outcome.obs + + @pytest.mark.asyncio + async def test_step_releases_sandbox(self): + sandbox = MockSandbox(default_stdout="output") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[TestCase(input="x", expected="output", id="t0")] + ) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + await env.env_reset() + await env.env_step("code") + + assert pool.checkout_calls == 1 + assert pool.release_calls == 1 + + +# --------------------------------------------------------------------- +# Environment Step Tests - Failure Cases +# --------------------------------------------------------------------- + + +class TestCodeExecEnvStepFailure: + @pytest.mark.asyncio + async def test_step_without_reset_returns_error(self): + sandbox = MockSandbox() + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + # Skip reset + 
outcome = await env.env_step("some code") + + assert outcome.terminated is True + assert outcome.reward == -1.0 + assert outcome.info["error"] == "reset_not_called" + + @pytest.mark.asyncio + async def test_step_with_empty_code(self): + sandbox = MockSandbox() + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + config = CodeExecConfig(compile_failure_reward=-0.5) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + await env.env_reset() + outcome = await env.env_step("") + + assert outcome.terminated is True + assert outcome.reward == -0.5 + assert outcome.info["error"] == "empty_code" + + @pytest.mark.asyncio + async def test_step_with_whitespace_only_code(self): + sandbox = MockSandbox() + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + await env.env_reset() + outcome = await env.env_step(" \n\t ") + + assert outcome.info["error"] == "empty_code" + + @pytest.mark.asyncio + async def test_step_compile_failure(self): + compile_result = CompileResult( + status=CompileStatus.SYNTAX_ERROR, + error_message="SyntaxError: invalid syntax", + error_line=5, + duration_ms=10.0, + ) + sandbox = MockSandbox(compile_result=compile_result) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + config = CodeExecConfig(compile_failure_reward=-0.2) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + await env.env_reset() + outcome = await env.env_step("def foo(") + + assert outcome.reward == -0.2 + assert outcome.info["compile_failed"] is True + assert "Compilation failed" in outcome.obs + assert "SyntaxError" in outcome.obs + + @pytest.mark.asyncio + async def test_step_some_tests_fail(self): + execute_results = { + "input1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + 
run_status=RunStatus.SUCCESS, + stdout="correct", + ), + "input2": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="wrong", # Will fail + ), + } + sandbox = MockSandbox(execute_results=execute_results) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[ + TestCase(input="input1", expected="correct", id="t0"), + TestCase(input="input2", expected="correct", id="t1"), + ] + ) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=CodeExecConfig(stop_on_first_failure=False), + ) + + await env.env_reset() + outcome = await env.env_step("code") + + assert outcome.reward == 0.0 # Binary reward, not all passed + assert outcome.info["all_passed"] is False + assert outcome.info["passed"] == 1 + assert outcome.info["total"] == 2 + + +# --------------------------------------------------------------------- +# Reward Shaping Tests +# --------------------------------------------------------------------- + + +class TestCodeExecEnvRewardShaping: + @pytest.mark.asyncio + async def test_binary_reward_all_pass(self): + sandbox = MockSandbox(default_stdout="out") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[TestCase(input="x", expected="out", id="t0")] + ) + config = CodeExecConfig(partial_credit=False) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + await env.env_reset() + outcome = await env.env_step("code") + + assert outcome.reward == 1.0 + + @pytest.mark.asyncio + async def test_binary_reward_some_fail(self): + execute_results = { + "in1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="correct", + ), + "in2": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="wrong", + ), + } + sandbox = 
MockSandbox(execute_results=execute_results) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[ + TestCase(input="in1", expected="correct", id="t0"), + TestCase(input="in2", expected="correct", id="t1"), + ] + ) + config = CodeExecConfig(partial_credit=False, stop_on_first_failure=False) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + await env.env_reset() + outcome = await env.env_step("code") + + assert outcome.reward == 0.0 # Binary: all or nothing + + @pytest.mark.asyncio + async def test_partial_credit_half_pass(self): + execute_results = { + "in1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="correct", + ), + "in2": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="correct", + ), + "in3": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="wrong", + ), + "in4": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="wrong", + ), + } + sandbox = MockSandbox(execute_results=execute_results) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[ + TestCase(input="in1", expected="correct", id="t0"), + TestCase(input="in2", expected="correct", id="t1"), + TestCase(input="in3", expected="correct", id="t2"), + TestCase(input="in4", expected="correct", id="t3"), + ] + ) + config = CodeExecConfig(partial_credit=True, stop_on_first_failure=False) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + await env.env_reset() + outcome = await env.env_step("code") + + assert outcome.reward == pytest.approx(0.5) # 2/4 passed + + +# --------------------------------------------------------------------- +# Caching Tests +# 
---------------------------------------------------------------------
+
+
+class TestCodeExecEnvCaching:
+    @pytest.mark.asyncio
+    async def test_cache_hit_skips_execution(self):
+        sandbox = MockSandbox(default_stdout="output")
+        pool = MockSandboxPool(sandbox=sandbox)
+        adapter = MockTestAdapter(
+            tests=[TestCase(input="x", expected="output", id="t0")]
+        )
+        config = CodeExecConfig(use_cache=True)
+
+        env = CodeExecEnv(
+            sample={},
+            sandbox_pool=pool,
+            test_adapter=adapter,
+            config=config,
+        )
+
+        await env.env_reset()
+
+        # First call - should execute
+        outcome1 = await env.env_step("print('output')")
+        assert pool.checkout_calls == 1
+        assert outcome1.info["cache_hit"] is False
+
+        # Second call with same code - should hit cache
+        await env.env_reset()  # Reset to allow another step
+        outcome2 = await env.env_step("print('output')")
+        assert pool.checkout_calls == 1  # No new checkout
+        assert outcome2.info["cache_hit"] is True
+
+    @pytest.mark.asyncio
+    async def test_cache_disabled(self):
+        sandbox = MockSandbox(default_stdout="output")
+        pool = MockSandboxPool(sandbox=sandbox)
+        adapter = MockTestAdapter(
+            tests=[TestCase(input="x", expected="output", id="t0")]
+        )
+        config = CodeExecConfig(use_cache=False)
+
+        env = CodeExecEnv(
+            sample={},
+            sandbox_pool=pool,
+            test_adapter=adapter,
+            config=config,
+        )
+
+        await env.env_reset()
+        outcome1 = await env.env_step("print('output')")
+        assert pool.checkout_calls == 1
+        assert outcome1.info["cache_hit"] is False
+
+        await env.env_reset()
+        outcome2 = await env.env_step("print('output')")
+        assert pool.checkout_calls == 2  # New execution each time
+        assert outcome2.info["cache_hit"] is False
+
+
+# ---------------------------------------------------------------------
+# Info Dict Tests
+# ---------------------------------------------------------------------
+
+
+class TestCodeExecEnvInfo:
+    @pytest.mark.asyncio
+    async def test_info_contains_required_fields(self):
+        sandbox = MockSandbox(default_stdout="out")
+        pool = MockSandboxPool(sandbox=sandbox,
python_version="3.10") + adapter = MockTestAdapter(problem_id="prob_42") + adapter._tests = [TestCase(input="x", expected="out", id="t0")] + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + await env.env_reset() + outcome = await env.env_step("code") + info = outcome.info + + # Problem metadata + assert info["problem_id"] == "prob_42" + assert "code_hash" in info + assert "tests_hash" in info + + # Test results summary + assert "passed" in info + assert "total" in info + assert "all_passed" in info + assert "pass_rate" in info + assert "compile_failed" in info + + # Detailed results + assert "test_results" in info + assert isinstance(info["test_results"], list) + + # Timing + assert "timing" in info + assert "total_compile_ms" in info["timing"] + assert "total_run_ms" in info["timing"] + assert "total_execution_ms" in info["timing"] + + # Cache and env info + assert "cache_hit" in info + assert info["python_version"] == "3.10" + + @pytest.mark.asyncio + async def test_info_test_results_detail(self): + execute_results = { + "in1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="correct", + run_duration_ms=100.0, + ), + } + sandbox = MockSandbox(execute_results=execute_results) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[TestCase(input="in1", expected="correct", id="test_001")] + ) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + ) + + await env.env_reset() + outcome = await env.env_step("code") + + test_result = outcome.info["test_results"][0] + assert test_result["test_id"] == "test_001" + assert test_result["passed"] is True + assert test_result["compiled"] is True + assert test_result["ran"] is True + assert test_result["run_status"] == "success" + assert test_result["compile_status"] == "success" + + +# --------------------------------------------------------------------- +# 
Observation Building Tests +# --------------------------------------------------------------------- + + +class TestCodeExecEnvObservation: + @pytest.mark.asyncio + async def test_observation_on_success(self): + sandbox = MockSandbox(default_stdout="out") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[ + TestCase(input="x", expected="out", id="t0"), + TestCase(input="y", expected="out", id="t1"), + ] + ) + + env = CodeExecEnv(sample={}, sandbox_pool=pool, test_adapter=adapter) + + await env.env_reset() + outcome = await env.env_step("code") + + assert "All 2 tests passed" in outcome.obs + + @pytest.mark.asyncio + async def test_observation_on_compile_error_includes_line(self): + compile_result = CompileResult( + status=CompileStatus.SYNTAX_ERROR, + error_message="invalid syntax", + error_line=42, + duration_ms=5.0, + ) + sandbox = MockSandbox(compile_result=compile_result) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + + env = CodeExecEnv(sample={}, sandbox_pool=pool, test_adapter=adapter) + + await env.env_reset() + outcome = await env.env_step("bad code") + + assert "Compilation failed" in outcome.obs + assert "line 42" in outcome.obs + + @pytest.mark.asyncio + async def test_observation_truncates_long_errors(self): + long_error = "E" * 1000 + compile_result = CompileResult( + status=CompileStatus.SYNTAX_ERROR, + error_message=long_error, + duration_ms=5.0, + ) + sandbox = MockSandbox(compile_result=compile_result) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + config = CodeExecConfig(max_error_length=100) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + await env.env_reset() + outcome = await env.env_step("code") + + # Error should be truncated with "..." + assert len(outcome.obs) < len(long_error) + assert "..." 
in outcome.obs + + @pytest.mark.asyncio + async def test_observation_includes_stderr_when_configured(self): + execute_results = { + "input": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.RUNTIME_ERROR, + stdout="", + stderr="NameError: x is not defined", + ), + } + sandbox = MockSandbox(execute_results=execute_results) + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[TestCase(input="input", expected="output", id="t0")] + ) + config = CodeExecConfig(include_stderr_in_obs=True) + + env = CodeExecEnv( + sample={}, + sandbox_pool=pool, + test_adapter=adapter, + config=config, + ) + + await env.env_reset() + outcome = await env.env_step("print(x)") + + assert "Stderr" in outcome.obs + assert "NameError" in outcome.obs + + +# --------------------------------------------------------------------- +# Current Observation Tests +# --------------------------------------------------------------------- + + +class TestCodeExecEnvCurrentObs: + @pytest.mark.asyncio + async def test_env_current_obs_before_reset(self): + sandbox = MockSandbox() + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter() + + env = CodeExecEnv(sample={}, sandbox_pool=pool, test_adapter=adapter) + + obs = env.env_current_obs() + assert "Error" in obs + assert "reset" in obs.lower() + + @pytest.mark.asyncio + async def test_env_current_obs_after_reset(self): + sandbox = MockSandbox() + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter(prompt="Solve this problem.") + + env = CodeExecEnv(sample={}, sandbox_pool=pool, test_adapter=adapter) + + await env.env_reset() + obs = env.env_current_obs() + + assert obs == "Solve this problem." 
+ + @pytest.mark.asyncio + async def test_env_current_obs_after_step(self): + sandbox = MockSandbox(default_stdout="result") + pool = MockSandboxPool(sandbox=sandbox) + adapter = MockTestAdapter( + tests=[TestCase(input="x", expected="result", id="t0")] + ) + + env = CodeExecEnv(sample={}, sandbox_pool=pool, test_adapter=adapter) + + await env.env_reset() + await env.env_step("code") + obs = env.env_current_obs() + + assert "passed" in obs diff --git a/tests/test_code_exec_podman.py b/tests/test_code_exec_podman.py new file mode 100644 index 0000000..301184d --- /dev/null +++ b/tests/test_code_exec_podman.py @@ -0,0 +1,546 @@ +""" +Unit tests for Podman-HPC sandbox implementation. + +These tests mock subprocess calls to test the logic without requiring +actual podman-hpc CLI or containers. +""" + +from __future__ import annotations + +import asyncio +import os +from dataclasses import dataclass +from typing import Optional +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from ludic.envs.code_exec.podman_sandbox import ( + LRUCache, + PodmanConfig, + PodmanError, + PodmanHPCSandbox, + PodmanHPCSandboxPool, + PodmanResult, + _get_container_name_prefix, +) +from ludic.envs.code_exec.backend import ( + SandboxBackend, + detect_available_backend, + is_docker_available, + is_podman_hpc_available, + is_singularity_available, + get_backend_info, +) +from ludic.envs.code_exec.types import ( + BatchTestResult, + CompileStatus, + RunStatus, + TestCase, + TestResult, + CompileResult, + ExecutionResult, +) + + +# ============================================================================ +# Container naming tests +# ============================================================================ + + +class TestContainerNaming: + """Tests for container name prefix generation.""" + + def test_local_prefix_without_slurm(self): + """Without SLURM_JOB_ID, should use 'local' prefix.""" + with patch.dict(os.environ, {}, clear=True): + # Ensure SLURM_JOB_ID is 
not set + os.environ.pop("SLURM_JOB_ID", None) + prefix = _get_container_name_prefix() + assert prefix == "ludic-sandbox-local" + + def test_slurm_prefix_with_job_id(self): + """With SLURM_JOB_ID, should include job ID in prefix.""" + with patch.dict(os.environ, {"SLURM_JOB_ID": "12345"}): + prefix = _get_container_name_prefix() + assert prefix == "ludic-sandbox-12345" + + +# ============================================================================ +# PodmanConfig tests +# ============================================================================ + + +class TestPodmanConfig: + """Tests for PodmanConfig dataclass.""" + + def test_default_config(self): + """Test default configuration values.""" + config = PodmanConfig() + assert config.memory_limit == "256m" + assert config.cpu_quota is None + assert config.network_disabled is True + assert config.working_dir == "/workspace" + assert config.gpu is False + assert config.extra_args is None + + def test_custom_config(self): + """Test custom configuration values.""" + config = PodmanConfig( + memory_limit="512m", + cpu_quota=0.5, + network_disabled=False, + gpu=True, + extra_args=["--security-opt", "label=disable"], + ) + assert config.memory_limit == "512m" + assert config.cpu_quota == 0.5 + assert config.network_disabled is False + assert config.gpu is True + assert config.extra_args == ["--security-opt", "label=disable"] + + +# ============================================================================ +# LRUCache tests (same as Docker implementation) +# ============================================================================ + + +class TestLRUCache: + """Tests for LRUCache implementation.""" + + def _make_batch_result(self, code_hash: str, tests_hash: str) -> BatchTestResult: + """Helper to create a BatchTestResult.""" + return BatchTestResult( + results=[], + code_hash=code_hash, + tests_hash=tests_hash, + ) + + def test_get_miss(self): + """Cache miss should return None and increment miss counter.""" + 
cache = LRUCache(max_size=10) + result = cache.get("code1", "tests1") + assert result is None + assert cache.stats["misses"] == 1 + assert cache.stats["hits"] == 0 + + def test_put_and_get(self): + """Should store and retrieve values.""" + cache = LRUCache(max_size=10) + batch_result = self._make_batch_result("code1", "tests1") + cache.put("code1", "tests1", batch_result) + + result = cache.get("code1", "tests1") + assert result is batch_result + assert cache.stats["hits"] == 1 + assert cache.stats["size"] == 1 + + def test_lru_eviction(self): + """Should evict least recently used when full.""" + cache = LRUCache(max_size=2) + + result1 = self._make_batch_result("code1", "tests1") + result2 = self._make_batch_result("code2", "tests2") + result3 = self._make_batch_result("code3", "tests3") + + cache.put("code1", "tests1", result1) + cache.put("code2", "tests2", result2) + # Access code1 to make it recently used + cache.get("code1", "tests1") + # Add code3, should evict code2 (least recently used) + cache.put("code3", "tests3", result3) + + assert cache.get("code1", "tests1") is result1 # Still there + assert cache.get("code2", "tests2") is None # Evicted + assert cache.get("code3", "tests3") is result3 # Still there + + def test_put_overwrites_existing(self): + """Should overwrite existing values with same key.""" + cache = LRUCache(max_size=10) + result1 = self._make_batch_result("code1", "tests1") + result2 = self._make_batch_result("code1", "tests1") + + cache.put("code1", "tests1", result1) + cache.put("code1", "tests1", result2) + + result = cache.get("code1", "tests1") + assert result is result2 + assert cache.stats["size"] == 1 + + +# ============================================================================ +# PodmanHPCSandbox tests (mocked subprocess) +# ============================================================================ + + +class TestPodmanHPCSandbox: + """Tests for PodmanHPCSandbox with mocked subprocess.""" + + @pytest.fixture + def 
sandbox(self): + """Create a sandbox instance for testing.""" + config = PodmanConfig(memory_limit="256m", network_disabled=True) + return PodmanHPCSandbox( + container_name="test-container", + image="python:3.11-slim", + config=config, + python_version="3.11", + ) + + @pytest.mark.asyncio + async def test_start_creates_container(self, sandbox): + """Start should create and run a persistent container.""" + mock_process = AsyncMock() + mock_process.returncode = 0 + mock_process.communicate = AsyncMock(return_value=(b"", b"")) + + with patch("asyncio.create_subprocess_exec", return_value=mock_process) as mock_exec: + await sandbox.start() + + # Should have called rm -f, run -d, and mkdir + assert mock_exec.call_count == 3 + calls = mock_exec.call_args_list + + # First call: rm -f + assert calls[0][0][0] == "podman-hpc" + assert "rm" in calls[0][0] + assert "-f" in calls[0][0] + + # Second call: run -d + assert calls[1][0][0] == "podman-hpc" + assert "run" in calls[1][0] + assert "-d" in calls[1][0] + assert "--name" in calls[1][0] + assert "test-container" in calls[1][0] + assert "sleep" in calls[1][0] + assert "infinity" in calls[1][0] + + # Third call: mkdir + assert calls[2][0][0] == "podman-hpc" + assert "exec" in calls[2][0] + assert "mkdir" in calls[2][0] + + @pytest.mark.asyncio + async def test_reset_clears_workspace(self, sandbox): + """Reset should clear the workspace directory.""" + sandbox._started = True + + mock_process = AsyncMock() + mock_process.returncode = 0 + mock_process.communicate = AsyncMock(return_value=(b"", b"")) + + with patch("asyncio.create_subprocess_exec", return_value=mock_process) as mock_exec: + await sandbox.reset() + + mock_exec.assert_called_once() + args = mock_exec.call_args[0] + assert "podman-hpc" in args + assert "exec" in args + assert "rm" in " ".join(args) + assert "/workspace/*" in " ".join(args) + + @pytest.mark.asyncio + async def test_compile_success(self, sandbox): + """Compile should return SUCCESS for valid 
code.""" + sandbox._started = True + + mock_process = AsyncMock() + mock_process.returncode = 0 + mock_process.communicate = AsyncMock(return_value=(b"", b"")) + + with patch("asyncio.create_subprocess_exec", return_value=mock_process): + result = await sandbox.compile("print('hello')") + + assert result.status == CompileStatus.SUCCESS + assert result.error_message is None + + @pytest.mark.asyncio + async def test_compile_syntax_error(self, sandbox): + """Compile should return SYNTAX_ERROR for invalid code.""" + sandbox._started = True + + error_output = b" File \"_check.py\", line 1\n def foo(\n ^\nSyntaxError: invalid syntax" + + # Create two different mock processes: + # 1. For _write_file (tar command) - should succeed + # 2. For py_compile - should fail with syntax error + write_process = AsyncMock() + write_process.returncode = 0 + write_process.communicate = AsyncMock(return_value=(b"", b"")) + + compile_process = AsyncMock() + compile_process.returncode = 1 + compile_process.communicate = AsyncMock(return_value=(b"", error_output)) + + call_count = [0] + def create_mock_process(*args, **kwargs): + call_count[0] += 1 + # First call is tar (write_file), second is py_compile + if call_count[0] == 1: + return write_process + return compile_process + + with patch("asyncio.create_subprocess_exec", side_effect=create_mock_process): + result = await sandbox.compile("def foo(") + + assert result.status == CompileStatus.SYNTAX_ERROR + assert "SyntaxError" in result.error_message + assert result.error_line == 1 + + @pytest.mark.asyncio + async def test_execute_success(self, sandbox): + """Execute should return SUCCESS and stdout for valid code.""" + sandbox._started = True + + # Mock two processes: one for compile (py_compile), one for execute + compile_process = AsyncMock() + compile_process.returncode = 0 + compile_process.communicate = AsyncMock(return_value=(b"", b"")) + + exec_process = AsyncMock() + exec_process.returncode = 0 + exec_process.communicate = 
AsyncMock(return_value=(b"hello world\n", b"")) + + call_count = [0] + def mock_create_subprocess(*args, **kwargs): + call_count[0] += 1 + # First few calls are for compile (write file, py_compile) + # Later calls are for execute (write file, run) + if "py_compile" in args or call_count[0] <= 2: + return compile_process + return exec_process + + with patch("asyncio.create_subprocess_exec", side_effect=mock_create_subprocess): + result = await sandbox.execute("print('hello world')") + + assert result.compiled + assert result.run_status == RunStatus.SUCCESS + assert "hello world" in result.stdout + + @pytest.mark.asyncio + async def test_execute_runtime_error(self, sandbox): + """Execute should return RUNTIME_ERROR for code that raises exception.""" + sandbox._started = True + + # Mock processes for various stages: + # 1. tar write (compile _write_file) + # 2. py_compile + # 3. tar write (execute _write_file) + # 4. python execution (runtime error) + success_process = AsyncMock() + success_process.returncode = 0 + success_process.communicate = AsyncMock(return_value=(b"", b"")) + + exec_process = AsyncMock() + exec_process.returncode = 1 + exec_process.communicate = AsyncMock(return_value=(b"", b"ZeroDivisionError: division by zero")) + + call_count = [0] + def mock_create_subprocess(*args, **kwargs): + call_count[0] += 1 + # Calls 1-3 are compile phase (tar, py_compile) and execute tar + # Call 4 is the actual execution + if call_count[0] <= 3: + return success_process + return exec_process + + with patch("asyncio.create_subprocess_exec", side_effect=mock_create_subprocess): + result = await sandbox.execute("1/0") + + assert result.compiled + assert result.run_status == RunStatus.RUNTIME_ERROR + assert "ZeroDivisionError" in result.stderr + + def test_parse_syntax_error(self): + """Test syntax error parsing.""" + error_msg = """ File "_check.py", line 5 + def foo( + ^ +SyntaxError: invalid syntax""" + + line, column, clean_msg = 
PodmanHPCSandbox._parse_syntax_error(error_msg) + + assert line == 5 + assert "SyntaxError" in clean_msg + assert "invalid syntax" in clean_msg + + +# ============================================================================ +# PodmanHPCSandboxPool tests +# ============================================================================ + + +class TestPodmanHPCSandboxPool: + """Tests for PodmanHPCSandboxPool.""" + + def test_parse_python_version_from_image(self): + """Should extract Python version from image name.""" + assert PodmanHPCSandboxPool._parse_python_version("python:3.11-slim") == "3.11" + assert PodmanHPCSandboxPool._parse_python_version("python:3.10") == "3.10" + assert PodmanHPCSandboxPool._parse_python_version("ghcr.io/foo/python:3.12-bullseye") == "3.12" + assert PodmanHPCSandboxPool._parse_python_version("custom-image:latest") == "3.11" # fallback + + def test_pool_initialization(self): + """Test pool initialization without starting.""" + pool = PodmanHPCSandboxPool( + n_workers=4, + image="python:3.11-slim", + cache_size=1000, + ) + + assert pool.python_version == "3.11" + assert pool.available == 0 # Not started yet + assert pool.cache_stats["size"] == 0 + + @pytest.mark.asyncio + async def test_checkout_before_start_raises(self): + """Checkout before start should raise RuntimeError.""" + pool = PodmanHPCSandboxPool(n_workers=2) + + with pytest.raises(RuntimeError, match="not started"): + await pool.checkout() + + @pytest.mark.asyncio + async def test_cache_operations(self): + """Test cache get/put operations.""" + pool = PodmanHPCSandboxPool(n_workers=2, cache_size=100) + + batch_result = BatchTestResult( + results=[], + code_hash="abc123", + tests_hash="def456", + ) + + # Cache miss + assert pool.get_cached("abc123", "def456") is None + + # Cache put + pool.put_cached("abc123", "def456", batch_result) + + # Cache hit + result = pool.get_cached("abc123", "def456") + assert result is batch_result + + +# 
============================================================================ +# Backend detection tests +# ============================================================================ + + +class TestBackendDetection: + """Tests for backend detection functions.""" + + def test_sandbox_backend_enum(self): + """Test SandboxBackend enum values.""" + assert SandboxBackend.DOCKER.value == "docker" + assert SandboxBackend.PODMAN_HPC.value == "podman-hpc" + assert SandboxBackend.SINGULARITY.value == "singularity" + assert SandboxBackend.AUTO.value == "auto" + + def test_is_podman_hpc_available_not_installed(self): + """Should return False when podman-hpc is not in PATH.""" + with patch("shutil.which", return_value=None): + assert is_podman_hpc_available() is False + + def test_is_podman_hpc_available_installed(self): + """Should return True when podman-hpc is in PATH.""" + with patch("shutil.which", return_value="/usr/bin/podman-hpc"): + assert is_podman_hpc_available() is True + + def test_is_singularity_available_not_installed(self): + """Should return False when singularity is not in PATH.""" + with patch("shutil.which", return_value=None): + assert is_singularity_available() is False + + def test_is_singularity_available_installed(self): + """Should return True when singularity is in PATH.""" + def mock_which(cmd): + if cmd == "singularity": + return "/usr/bin/singularity" + return None + + with patch("shutil.which", side_effect=mock_which): + assert is_singularity_available() is True + + def test_is_singularity_available_apptainer(self): + """Should return True when apptainer (renamed singularity) is in PATH.""" + def mock_which(cmd): + if cmd == "apptainer": + return "/usr/bin/apptainer" + return None + + with patch("shutil.which", side_effect=mock_which): + assert is_singularity_available() is True + + def test_detect_backend_in_slurm_with_podman(self): + """In Slurm with podman-hpc available, should prefer podman-hpc.""" + with patch.dict(os.environ, 
{"SLURM_JOB_ID": "12345"}): + with patch("shutil.which", return_value="/usr/bin/podman-hpc"): + with patch("ludic.envs.code_exec.backend.is_docker_available", return_value=True): + backend = detect_available_backend() + assert backend == "podman-hpc" + + def test_detect_backend_outside_slurm_with_docker(self): + """Outside Slurm with Docker available, should prefer Docker.""" + with patch.dict(os.environ, {}, clear=True): + os.environ.pop("SLURM_JOB_ID", None) + with patch("ludic.envs.code_exec.backend.is_docker_available", return_value=True): + backend = detect_available_backend() + assert backend == "docker" + + def test_detect_backend_outside_slurm_no_docker_with_podman(self): + """Outside Slurm without Docker but with podman-hpc, should use podman-hpc.""" + with patch.dict(os.environ, {}, clear=True): + os.environ.pop("SLURM_JOB_ID", None) + with patch("ludic.envs.code_exec.backend.is_docker_available", return_value=False): + with patch("shutil.which", return_value="/usr/bin/podman-hpc"): + backend = detect_available_backend() + assert backend == "podman-hpc" + + def test_detect_backend_none_available_raises(self): + """Should raise RuntimeError when no backend is available.""" + with patch.dict(os.environ, {}, clear=True): + os.environ.pop("SLURM_JOB_ID", None) + with patch("ludic.envs.code_exec.backend.is_docker_available", return_value=False): + with patch("shutil.which", return_value=None): + with pytest.raises(RuntimeError, match="No sandbox backend available"): + detect_available_backend() + + def test_get_backend_info(self): + """Test get_backend_info returns structured data.""" + with patch.dict(os.environ, {"SLURM_JOB_ID": "99999"}): + with patch("ludic.envs.code_exec.backend.is_docker_available", return_value=False): + with patch("shutil.which", return_value="/usr/bin/podman-hpc"): + info = get_backend_info() + + assert info["environment"]["in_slurm"] is True + assert info["environment"]["slurm_job_id"] == "99999" + assert "docker" in info["backends"] 
+ assert "podman-hpc" in info["backends"] + assert info["backends"]["podman-hpc"]["available"] is True + assert info["backends"]["docker"]["available"] is False + + +# ============================================================================ +# Factory tests +# ============================================================================ + + +class TestFactory: + """Tests for create_sandbox_pool factory.""" + + @pytest.mark.asyncio + async def test_factory_unknown_backend_raises(self): + """Factory should raise ValueError for unknown backend.""" + from ludic.envs.code_exec.factory import create_sandbox_pool + + with pytest.raises(ValueError, match="Unknown backend"): + await create_sandbox_pool(backend="unknown") + + @pytest.mark.asyncio + async def test_factory_singularity_not_implemented(self): + """Factory should raise NotImplementedError for singularity.""" + from ludic.envs.code_exec.factory import create_sandbox_pool + + with pytest.raises(NotImplementedError, match="Singularity backend is not yet implemented"): + await create_sandbox_pool(backend="singularity") diff --git a/tests/test_code_exec_runners.py b/tests/test_code_exec_runners.py new file mode 100644 index 0000000..7853c4c --- /dev/null +++ b/tests/test_code_exec_runners.py @@ -0,0 +1,478 @@ +""" +Unit tests for ludic.envs.code_exec.runners + +Tests hash utilities and StdinStdoutRunner with mock sandbox. 
+""" + +import pytest + +from ludic.envs.code_exec.runners import ( + compute_hash, + hash_tests, + StdinStdoutRunner, +) +from ludic.envs.code_exec.types import ( + TestCase, + CompileResult, + CompileStatus, + ExecutionResult, + RunStatus, +) +from ludic.envs.code_exec.adapters.base import ExactMatchVerifier + + +# --------------------------------------------------------------------- +# Hash Utility Tests +# --------------------------------------------------------------------- + + +class TestComputeHash: + def test_returns_16_chars(self): + result = compute_hash("hello world") + assert len(result) == 16 + + def test_deterministic(self): + result1 = compute_hash("test content") + result2 = compute_hash("test content") + assert result1 == result2 + + def test_different_content_different_hash(self): + result1 = compute_hash("content a") + result2 = compute_hash("content b") + assert result1 != result2 + + def test_hex_characters_only(self): + result = compute_hash("any content") + assert all(c in "0123456789abcdef" for c in result) + + def test_empty_string(self): + result = compute_hash("") + assert len(result) == 16 + + +class TestHashTests: + def test_returns_16_chars(self): + tests = [TestCase(input="1", expected="2", id="t1")] + result = hash_tests(tests) + assert len(result) == 16 + + def test_deterministic(self): + tests = [ + TestCase(input="1", expected="a", id="t1"), + TestCase(input="2", expected="b", id="t2"), + ] + result1 = hash_tests(tests) + result2 = hash_tests(tests) + assert result1 == result2 + + def test_different_tests_different_hash(self): + tests1 = [TestCase(input="1", expected="a", id="t1")] + tests2 = [TestCase(input="2", expected="b", id="t2")] + result1 = hash_tests(tests1) + result2 = hash_tests(tests2) + assert result1 != result2 + + def test_order_matters(self): + tests1 = [ + TestCase(input="1", expected="a", id="t1"), + TestCase(input="2", expected="b", id="t2"), + ] + tests2 = [ + TestCase(input="2", expected="b", id="t2"), + 
TestCase(input="1", expected="a", id="t1"),
+        ]
+        result1 = hash_tests(tests1)
+        result2 = hash_tests(tests2)
+        assert result1 != result2
+
+    def test_empty_list(self):
+        result = hash_tests([])
+        assert len(result) == 16
+
+
+# ---------------------------------------------------------------------
+# Mock Sandbox for Runner Tests
+# ---------------------------------------------------------------------
+
+
+class MockSandbox:
+    """
+    A mock sandbox for testing runners.
+
+    Can be configured with:
+    - compile_result: What to return from compile()
+    - execute_results: Dict mapping stdin -> ExecutionResult
+    - default_stdout: Fallback stdout for stdin values not in execute_results
+    """
+
+    def __init__(
+        self,
+        compile_result: CompileResult | None = None,
+        execute_results: dict[str, ExecutionResult] | None = None,
+        default_stdout: str = "",
+    ):
+        self._compile_result = compile_result or CompileResult(
+            status=CompileStatus.SUCCESS,
+            duration_ms=10.0,
+        )
+        self._execute_results = execute_results or {}
+        self._default_stdout = default_stdout
+        self._python_version = "3.11"
+
+        # Track calls for assertions
+        self.compile_calls: list[str] = []
+        self.execute_calls: list[tuple[str, str]] = []
+
+    @property
+    def python_version(self) -> str:
+        return self._python_version
+
+    async def reset(self) -> None:
+        pass
+
+    async def compile(self, code: str, *, timeout_s: float = 5.0) -> CompileResult:
+        self.compile_calls.append(code)
+        return self._compile_result
+
+    async def execute(
+        self,
+        code: str,
+        *,
+        stdin: str = "",
+        skip_compile: bool = False,
+        timeout_s: float = 10.0,
+        memory_limit_mb: int | None = None,
+        env_vars: dict[str, str] | None = None,
+    ) -> ExecutionResult:
+        self.execute_calls.append((code, stdin))
+
+        if stdin in self._execute_results:
+            return self._execute_results[stdin]
+
+        # Default: successful execution returning default_stdout
+        return ExecutionResult(
+            compile_result=self._compile_result,
+            run_status=RunStatus.SUCCESS,
+            stdout=self._default_stdout,
+ stderr="", + exit_code=0, + compile_duration_ms=10.0, + run_duration_ms=50.0, + total_duration_ms=60.0, + ) + + +# --------------------------------------------------------------------- +# StdinStdoutRunner Tests +# --------------------------------------------------------------------- + + +class TestStdinStdoutRunner: + @pytest.mark.asyncio + async def test_all_tests_pass(self): + sandbox = MockSandbox(default_stdout="expected_output") + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + tests = [ + TestCase(input="input1", expected="expected_output", id="t1"), + TestCase(input="input2", expected="expected_output", id="t2"), + ] + + result = await runner.run_tests( + sandbox=sandbox, + code="print('expected_output')", + tests=tests, + verifier=verifier, + ) + + assert result.all_passed is True + assert result.passed_count == 2 + assert result.total_count == 2 + + @pytest.mark.asyncio + async def test_some_tests_fail(self): + # First test passes, second fails + execute_results = { + "input1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="correct", + ), + "input2": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="wrong", + ), + } + sandbox = MockSandbox(execute_results=execute_results) + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + tests = [ + TestCase(input="input1", expected="correct", id="t1"), + TestCase(input="input2", expected="correct", id="t2"), # Will fail + ] + + result = await runner.run_tests( + sandbox=sandbox, + code="code", + tests=tests, + verifier=verifier, + ) + + assert result.all_passed is False + assert result.passed_count == 1 + assert result.total_count == 2 + assert result.results[0].passed is True + assert result.results[1].passed is False + + @pytest.mark.asyncio + async def test_compile_failure_fails_all_tests(self): + compile_result = CompileResult( + 
status=CompileStatus.SYNTAX_ERROR, + error_message="SyntaxError: invalid syntax", + error_line=5, + duration_ms=5.0, + ) + sandbox = MockSandbox(compile_result=compile_result) + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + tests = [ + TestCase(input="input1", expected="x", id="t1"), + TestCase(input="input2", expected="y", id="t2"), + TestCase(input="input3", expected="z", id="t3"), + ] + + result = await runner.run_tests( + sandbox=sandbox, + code="invalid syntax here", + tests=tests, + verifier=verifier, + compile_first=True, + ) + + assert result.compile_failed is True + assert result.all_passed is False + assert result.passed_count == 0 + assert len(result.results) == 3 + + # All should have compile failure details + for r in result.results: + assert r.compiled is False + assert "Compilation failed" in (r.comparison_details or "") + + @pytest.mark.asyncio + async def test_stop_on_first_failure(self): + execute_results = { + "input1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="wrong", # First test fails + ), + } + sandbox = MockSandbox(execute_results=execute_results, default_stdout="correct") + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + tests = [ + TestCase(input="input1", expected="correct", id="t1"), # Fails + TestCase(input="input2", expected="correct", id="t2"), # Should be skipped + TestCase(input="input3", expected="correct", id="t3"), # Should be skipped + ] + + result = await runner.run_tests( + sandbox=sandbox, + code="code", + tests=tests, + verifier=verifier, + stop_on_first_failure=True, + ) + + assert result.passed_count == 0 + assert len(result.results) == 3 + + # First test ran and failed + assert result.results[0].passed is False + assert result.results[0].ran is True + + # Second and third were skipped + assert result.results[1].passed is False + assert result.results[1].execution.run_status == RunStatus.NOT_RUN + assert 
result.results[2].passed is False + assert result.results[2].execution.run_status == RunStatus.NOT_RUN + + @pytest.mark.asyncio + async def test_runtime_error_fails_test(self): + execute_results = { + "input1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.RUNTIME_ERROR, + stdout="", + stderr="NameError: name 'x' is not defined", + exit_code=1, + ), + } + sandbox = MockSandbox(execute_results=execute_results) + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + tests = [TestCase(input="input1", expected="output", id="t1")] + + result = await runner.run_tests( + sandbox=sandbox, + code="print(x)", + tests=tests, + verifier=verifier, + ) + + assert result.passed_count == 0 + assert result.results[0].passed is False + assert "Runtime error" in (result.results[0].comparison_details or "") + + @pytest.mark.asyncio + async def test_timeout_fails_test(self): + execute_results = { + "input1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.TIMEOUT, + stdout="", + stderr="", + run_duration_ms=5000.0, + ), + } + sandbox = MockSandbox(execute_results=execute_results) + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + tests = [TestCase(input="input1", expected="output", id="t1")] + + result = await runner.run_tests( + sandbox=sandbox, + code="while True: pass", + tests=tests, + verifier=verifier, + ) + + assert result.passed_count == 0 + assert result.results[0].passed is False + assert "timed out" in (result.results[0].comparison_details or "").lower() + + @pytest.mark.asyncio + async def test_memory_exceeded_fails_test(self): + execute_results = { + "input1": ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.MEMORY_EXCEEDED, + stdout="", + stderr="", + ), + } + sandbox = MockSandbox(execute_results=execute_results) + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + 
tests = [TestCase(input="input1", expected="output", id="t1")] + + result = await runner.run_tests( + sandbox=sandbox, + code="x = [0] * 10**9", + tests=tests, + verifier=verifier, + ) + + assert result.passed_count == 0 + assert result.results[0].passed is False + assert "Memory" in (result.results[0].comparison_details or "") + + @pytest.mark.asyncio + async def test_per_test_timeout_override(self): + sandbox = MockSandbox(default_stdout="output") + runner = StdinStdoutRunner(default_timeout_s=5.0) + verifier = ExactMatchVerifier() + + tests = [ + TestCase( + input="input1", + expected="output", + id="t1", + metadata={"timeout_s": 30.0}, # Override + ), + ] + + await runner.run_tests( + sandbox=sandbox, + code="code", + tests=tests, + verifier=verifier, + ) + + # Check that execute was called with the overridden timeout + # The mock doesn't actually use timeout, but we can verify the call was made + assert len(sandbox.execute_calls) == 1 + + @pytest.mark.asyncio + async def test_compile_first_false_skips_compile(self): + sandbox = MockSandbox(default_stdout="output") + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + tests = [TestCase(input="input1", expected="output", id="t1")] + + await runner.run_tests( + sandbox=sandbox, + code="code", + tests=tests, + verifier=verifier, + compile_first=False, + ) + + # compile() should not be called when compile_first=False + assert len(sandbox.compile_calls) == 0 + assert len(sandbox.execute_calls) == 1 + + @pytest.mark.asyncio + async def test_hashes_computed_correctly(self): + sandbox = MockSandbox(default_stdout="output") + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() + + code = "print('hello')" + tests = [TestCase(input="input1", expected="output", id="t1")] + + result = await runner.run_tests( + sandbox=sandbox, + code=code, + tests=tests, + verifier=verifier, + ) + + # Verify hashes are present and have correct format + assert len(result.code_hash) == 16 + assert 
len(result.tests_hash) == 16 + assert all(c in "0123456789abcdef" for c in result.code_hash) + assert all(c in "0123456789abcdef" for c in result.tests_hash) + + # Verify code_hash matches compute_hash + from ludic.envs.code_exec.runners import compute_hash + + assert result.code_hash == compute_hash(code) + + @pytest.mark.asyncio + async def test_whitespace_stripping_in_comparison(self): + """Verifier should strip whitespace from output.""" + sandbox = MockSandbox(default_stdout=" output\n") + runner = StdinStdoutRunner() + verifier = ExactMatchVerifier() # strips by default + + tests = [TestCase(input="input1", expected="output", id="t1")] + + result = await runner.run_tests( + sandbox=sandbox, + code="code", + tests=tests, + verifier=verifier, + ) + + assert result.all_passed is True diff --git a/tests/test_code_exec_types.py b/tests/test_code_exec_types.py new file mode 100644 index 0000000..edffc08 --- /dev/null +++ b/tests/test_code_exec_types.py @@ -0,0 +1,400 @@ +""" +Unit tests for ludic.envs.code_exec.types + +Tests all dataclasses and their properties/methods. 
+""" + +import pytest + +from ludic.envs.code_exec.types import ( + CompileStatus, + RunStatus, + CompileResult, + ExecutionResult, + TestCase, + TestResult, + BatchTestResult, +) + + +# --------------------------------------------------------------------- +# CompileResult Tests +# --------------------------------------------------------------------- + + +class TestCompileResult: + def test_success_property_true_when_status_success(self): + result = CompileResult(status=CompileStatus.SUCCESS) + assert result.success is True + + def test_success_property_false_when_syntax_error(self): + result = CompileResult( + status=CompileStatus.SYNTAX_ERROR, + error_message="SyntaxError: invalid syntax", + error_line=5, + error_column=10, + ) + assert result.success is False + + def test_success_property_false_for_all_error_statuses(self): + error_statuses = [ + CompileStatus.SYNTAX_ERROR, + CompileStatus.IMPORT_ERROR, + CompileStatus.TIMEOUT, + CompileStatus.UNKNOWN_ERROR, + ] + for status in error_statuses: + result = CompileResult(status=status) + assert result.success is False, f"Expected success=False for {status}" + + def test_duration_ms_default_zero(self): + result = CompileResult(status=CompileStatus.SUCCESS) + assert result.duration_ms == 0.0 + + +# --------------------------------------------------------------------- +# ExecutionResult Tests +# --------------------------------------------------------------------- + + +class TestExecutionResult: + def test_compiled_true_when_compile_succeeded(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + ) + assert result.compiled is True + + def test_compiled_false_when_compile_failed(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SYNTAX_ERROR), + ) + assert result.compiled is False + + def test_succeeded_true_when_compiled_and_run_success(self): + result = ExecutionResult( + 
compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + stdout="output", + ) + assert result.succeeded is True + + def test_succeeded_false_when_compile_failed(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SYNTAX_ERROR), + ) + assert result.succeeded is False + + def test_succeeded_false_when_runtime_error(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.RUNTIME_ERROR, + stderr="NameError: name 'x' is not defined", + ) + assert result.succeeded is False + + def test_succeeded_false_when_timeout(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.TIMEOUT, + ) + assert result.succeeded is False + + def test_timed_out_true_when_compile_timeout(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.TIMEOUT), + ) + assert result.timed_out is True + + def test_timed_out_true_when_run_timeout(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.TIMEOUT, + ) + assert result.timed_out is True + + def test_timed_out_false_when_success(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + ) + assert result.timed_out is False + + def test_default_values(self): + result = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + ) + assert result.run_status is None + assert result.stdout == "" + assert result.stderr == "" + assert result.exit_code is None + assert result.cache_hit is False + + +# --------------------------------------------------------------------- +# TestCase Tests +# --------------------------------------------------------------------- + + +class TestTestCase: + def test_basic_creation(self): + tc = TestCase(input="1 2", expected="3", id="test_add") 
+ assert tc.input == "1 2" + assert tc.expected == "3" + assert tc.id == "test_add" + + def test_default_weight(self): + tc = TestCase(input="x", expected="y") + assert tc.weight == 1.0 + + def test_default_metadata_is_empty_dict(self): + tc = TestCase(input="x", expected="y") + assert tc.metadata == {} + + def test_metadata_with_custom_values(self): + tc = TestCase( + input="x", + expected="y", + metadata={"timeout_s": 10.0, "category": "math"}, + ) + assert tc.metadata["timeout_s"] == 10.0 + assert tc.metadata["category"] == "math" + + +# --------------------------------------------------------------------- +# TestResult Tests +# --------------------------------------------------------------------- + + +class TestTestResult: + def test_compiled_delegates_to_execution(self): + execution = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + ) + tr = TestResult( + test_case=TestCase(input="1", expected="1"), + passed=True, + actual="1", + execution=execution, + ) + assert tr.compiled is True + + def test_compiled_false_when_execution_compile_failed(self): + execution = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SYNTAX_ERROR), + ) + tr = TestResult( + test_case=TestCase(input="1", expected="1"), + passed=False, + actual="", + execution=execution, + ) + assert tr.compiled is False + + def test_ran_true_when_execution_has_run_status(self): + execution = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.SUCCESS, + ) + tr = TestResult( + test_case=TestCase(input="1", expected="1"), + passed=True, + actual="1", + execution=execution, + ) + assert tr.ran is True + + def test_ran_false_when_run_status_none(self): + execution = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SYNTAX_ERROR), + run_status=None, + ) + tr = TestResult( + test_case=TestCase(input="1", expected="1"), + passed=False, + actual="", + 
execution=execution, + ) + assert tr.ran is False + + def test_ran_false_when_run_status_not_run(self): + execution = ExecutionResult( + compile_result=CompileResult(status=CompileStatus.SUCCESS), + run_status=RunStatus.NOT_RUN, + ) + tr = TestResult( + test_case=TestCase(input="1", expected="1"), + passed=False, + actual="", + execution=execution, + ) + assert tr.ran is False + + +# --------------------------------------------------------------------- +# BatchTestResult Tests +# --------------------------------------------------------------------- + + +def _make_test_result(passed: bool, compiled: bool = True) -> TestResult: + """Helper to create TestResult with minimal boilerplate.""" + if compiled: + compile_result = CompileResult(status=CompileStatus.SUCCESS) + run_status = RunStatus.SUCCESS if passed else RunStatus.RUNTIME_ERROR + else: + compile_result = CompileResult(status=CompileStatus.SYNTAX_ERROR) + run_status = None + + return TestResult( + test_case=TestCase(input="x", expected="y"), + passed=passed, + actual="y" if passed else "z", + execution=ExecutionResult( + compile_result=compile_result, + run_status=run_status, + ), + ) + + +class TestBatchTestResult: + def test_passed_count(self): + results = [ + _make_test_result(passed=True), + _make_test_result(passed=True), + _make_test_result(passed=False), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.passed_count == 2 + + def test_total_count(self): + results = [_make_test_result(passed=True) for _ in range(5)] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.total_count == 5 + + def test_all_passed_true_when_all_pass(self): + results = [_make_test_result(passed=True) for _ in range(3)] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.all_passed is True + + def test_all_passed_false_when_one_fails(self): + results = [ + _make_test_result(passed=True), + 
_make_test_result(passed=False), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.all_passed is False + + def test_all_passed_false_when_empty(self): + batch = BatchTestResult(results=[], code_hash="abc", tests_hash="xyz") + assert batch.all_passed is False + + def test_pass_rate_full(self): + results = [_make_test_result(passed=True) for _ in range(4)] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.pass_rate == pytest.approx(1.0) + + def test_pass_rate_half(self): + results = [ + _make_test_result(passed=True), + _make_test_result(passed=True), + _make_test_result(passed=False), + _make_test_result(passed=False), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.pass_rate == pytest.approx(0.5) + + def test_pass_rate_zero_when_empty(self): + batch = BatchTestResult(results=[], code_hash="abc", tests_hash="xyz") + assert batch.pass_rate == pytest.approx(0.0) + + def test_first_failure_returns_first_failed_test(self): + results = [ + _make_test_result(passed=True), + _make_test_result(passed=False), # first failure + _make_test_result(passed=False), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.first_failure is results[1] + + def test_first_failure_none_when_all_pass(self): + results = [_make_test_result(passed=True) for _ in range(3)] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.first_failure is None + + def test_compile_failed_true_when_first_result_not_compiled(self): + results = [ + _make_test_result(passed=False, compiled=False), + _make_test_result(passed=False, compiled=False), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.compile_failed is True + + def test_compile_failed_false_when_compiled(self): + results = [_make_test_result(passed=True)] + batch = 
BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + assert batch.compile_failed is False + + def test_compile_failed_false_when_empty(self): + batch = BatchTestResult(results=[], code_hash="abc", tests_hash="xyz") + assert batch.compile_failed is False + + def test_get_failures_returns_only_failed_tests(self): + results = [ + _make_test_result(passed=True), + _make_test_result(passed=False), + _make_test_result(passed=True), + _make_test_result(passed=False), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + failures = batch.get_failures() + assert len(failures) == 2 + assert failures[0] is results[1] + assert failures[1] is results[3] + + def test_get_successes_returns_only_passed_tests(self): + results = [ + _make_test_result(passed=True), + _make_test_result(passed=False), + _make_test_result(passed=True), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + successes = batch.get_successes() + assert len(successes) == 2 + assert successes[0] is results[0] + assert successes[1] is results[2] + + def test_timing_aggregation(self): + # Create results with specific timing + def make_result_with_timing(compile_ms: float, run_ms: float) -> TestResult: + return TestResult( + test_case=TestCase(input="x", expected="y"), + passed=True, + actual="y", + execution=ExecutionResult( + compile_result=CompileResult( + status=CompileStatus.SUCCESS, + duration_ms=compile_ms, + ), + run_status=RunStatus.SUCCESS, + compile_duration_ms=compile_ms, + run_duration_ms=run_ms, + total_duration_ms=compile_ms + run_ms, + ), + ) + + results = [ + make_result_with_timing(10.0, 100.0), + make_result_with_timing(10.0, 200.0), + make_result_with_timing(10.0, 150.0), + ] + batch = BatchTestResult(results=results, code_hash="abc", tests_hash="xyz") + + # Compile time: max across all (since compilation usually happens once) + assert batch.total_compile_ms == pytest.approx(10.0) + + # Run time: sum across all tests 
+        assert batch.total_run_ms == pytest.approx(450.0)  # 100 + 200 + 150
+
+        # Total: sum of all total_duration_ms
+        assert batch.total_execution_ms == pytest.approx(480.0)  # 110 + 210 + 160
diff --git a/tests/test_flash_attention.py b/tests/test_flash_attention.py
new file mode 100644
index 0000000..fa33e07
--- /dev/null
+++ b/tests/test_flash_attention.py
@@ -0,0 +1,151 @@
+"""
+GPU tests for Flash Attention and hardware detection.
+
+These tests are designed to run on interactive GPU nodes (not login nodes).
+Mark with @pytest.mark.gpu and run with: pytest -v -m gpu
+
+Usage on Isambard:
+    srun --nodes=1 --gpus=1 --time=10:00 --pty bash
+    uv run pytest tests/test_flash_attention.py -v -m gpu -s
+"""
+
+from __future__ import annotations
+
+import logging
+import pytest
+import torch
+
+# Configure logging for visibility during tests
+logging.basicConfig(level=logging.DEBUG, format="%(name)s: %(message)s")
+
+
+
+@pytest.mark.gpu
+def test_cuda_available():
+    """Verify CUDA is available (basic sanity check)."""
+    assert torch.cuda.is_available(), "CUDA not available - run on a GPU node"
+
+
+@pytest.mark.gpu
+def test_flash_sdp_enabled():
+    """Verify the Flash SDP backend can be enabled."""
+    torch.backends.cuda.enable_flash_sdp(True)
+    # Note: flash_sdp_enabled() only reports whether the backend flag is set;
+    # whether flash kernels are actually used depends on input shapes/dtypes at runtime.
+    assert torch.backends.cuda.flash_sdp_enabled()
+
+
+@pytest.mark.gpu
+def test_detect_gpu_architecture():
+    """Detect real GPU architecture."""
+    from ludic.training.hardware import detect_gpu_architecture
+
+    arch = detect_gpu_architecture()
+    assert arch is not None, "Could not detect GPU architecture"
+
+    # Log the detected architecture
+    device_name = torch.cuda.get_device_name()
+    capability = torch.cuda.get_device_capability()
+    print(f"GPU: {device_name}")
+    print(f"Compute capability: sm_{capability[0]}{capability[1]}")
+    print(f"Detected architecture: {arch}")
+
+    # Validate 
known architectures + assert arch in ("hopper", "ampere", "ada", "turing", "volta", "older") + + +@pytest.mark.gpu +def test_get_cuda_version(): + """Verify CUDA version detection.""" + from ludic.training.hardware import get_cuda_version + + version = get_cuda_version() + assert version is not None, "Could not get CUDA version" + + major, minor = version + print(f"CUDA version: {major}.{minor}") + + # Reasonable version bounds + assert major >= 11, f"CUDA version {major}.{minor} is too old for Flash Attention" + + +@pytest.mark.gpu +def test_flash_attn_import(): + """Verify flash-attn package loads and reports version.""" + try: + import flash_attn + version = flash_attn.__version__ + print(f"flash-attn version: {version}") + + # Check version is >= 2.7.0 for FA3 support + parts = version.split(".") + major, minor = int(parts[0]), int(parts[1]) + assert (major, minor) >= (2, 7), f"flash-attn {version} < 2.7.0, FA3 not supported" + + except ImportError as e: + pytest.skip(f"flash-attn not installed: {e}") + + +@pytest.mark.gpu +def test_get_optimal_attention_impl(): + """Test optimal attention implementation selection.""" + from ludic.training.hardware import get_optimal_attention_impl + + # With flash attention enabled (default) + impl = get_optimal_attention_impl(disable_flash_attn=False) + print(f"Optimal attention (enabled): {impl}") + assert impl in ("flash_attention_3", "flash_attention_2", "sdpa", "eager") + + # With flash attention disabled + impl_disabled = get_optimal_attention_impl(disable_flash_attn=True) + print(f"Optimal attention (disabled): {impl_disabled}") + assert impl_disabled == "sdpa" + + +@pytest.mark.gpu +def test_configure_flash_attention(): + """Test full Flash Attention configuration.""" + from ludic.training.hardware import configure_flash_attention + + # Configure for CUDA device + attn_impl = configure_flash_attention("cuda", disable_flash_attn=False) + print(f"Configured attention: {attn_impl}") + assert attn_impl in 
("flash_attention_3", "flash_attention_2", "sdpa") + + # Configure for CPU (should return eager) + attn_impl_cpu = configure_flash_attention("cpu", disable_flash_attn=False) + assert attn_impl_cpu == "eager" + + +@pytest.mark.gpu +def test_model_with_flash_attention(): + """Load a small model with flash attention and run forward pass.""" + from ludic.training.hardware import configure_flash_attention + from transformers import AutoModelForCausalLM, AutoTokenizer + + model_name = "Qwen/Qwen2.5-0.5B-Instruct" + + # Configure flash attention + attn_impl = configure_flash_attention("cuda", disable_flash_attn=False) + print(f"Using attention: {attn_impl}") + + # Load model with flash attention + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + attn_implementation=attn_impl, + ).cuda() + + # Verify model loaded with correct attention + print(f"Model attention impl: {model.config._attn_implementation}") + + # Run a forward pass + inputs = tokenizer("Hello, world!", return_tensors="pt").to("cuda") + with torch.no_grad(): + outputs = model(**inputs) + + assert outputs.logits is not None + assert outputs.logits.shape[0] == 1 # batch size + print(f"Forward pass successful, logits shape: {outputs.logits.shape}") diff --git a/tests/test_incomplete_completion.py b/tests/test_incomplete_completion.py index c72989d..d40c044 100644 --- a/tests/test_incomplete_completion.py +++ b/tests/test_incomplete_completion.py @@ -4,7 +4,7 @@ from ludic.agents.base_agent import Agent from ludic.context.full_dialog import FullDialog -from ludic.interaction.single_agent import SingleAgentSyncProtocol +from ludic.interaction.single_agent import SingleAgentProtocol from ludic.parsers import ParseResult from tests._mocks import MockClient, MockEnv @@ -66,7 +66,7 @@ async def test_single_agent_protocol_marks_incomplete_completion_as_parse_error( ctx=FullDialog(), 
parser=pass_through_parser, ) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) env = MockEnv(max_steps=10, target="1") rollouts = await protocol.run(env=env, max_steps=1) diff --git a/tests/test_interaction.py b/tests/test_interaction.py index 9159c2d..2351710 100644 --- a/tests/test_interaction.py +++ b/tests/test_interaction.py @@ -2,7 +2,7 @@ import pytest from ludic.context.full_dialog import FullDialog -from ludic.interaction.single_agent import SingleAgentSyncProtocol +from ludic.interaction.single_agent import SingleAgentProtocol from ludic.interaction.multi_agent import MultiAgentProtocol from ludic.agents.base_agent import Agent from ludic.inference.client import ChatResponse @@ -28,7 +28,7 @@ async def test_happy_path_terminates_immediately(): env = MockEnv(max_steps=3, target="1") # MockAgent provides a default ctx and a pass-through parser agent = MockAgent(client=MockClient(text="1")) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) # run() now returns List[Rollout] rollouts = await protocol.run( @@ -51,7 +51,7 @@ async def complete(self, request: ChatCompletionRequest, **kwargs): env = MockEnv(max_steps=2, target="1") agent = MockAgent(client=WrongClient()) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) rollouts = await protocol.run( env=env, @@ -97,7 +97,7 @@ async def test_run_episode_uses_action_parser_and_logs_parsed_action(): parser=action_parser ) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) rollouts = await protocol.run( env=env, @@ -312,7 +312,7 @@ async def test_multi_agent_handles_unmanaged_bot_turns(): @pytest.mark.asyncio async def test_single_agent_protocol_logs_parser_failure_without_env_step(): """ - If the agent parser fails, SingleAgentSyncProtocol should: + If the agent parser fails, SingleAgentProtocol should: - NOT call env.step() - log a 
synthetic step with parse_error info - feed the synthetic observation back to the agent context @@ -337,7 +337,7 @@ def always_fail_parser(_: str) -> ParseResult: ctx=FullDialog(), parser=always_fail_parser, ) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) rollouts = await protocol.run(env=env, max_steps=1) @@ -497,7 +497,7 @@ async def test_single_agent_max_steps_truncation(): # Agent always says "wrong", env wants "correct" env = MockEnv(max_steps=10, target="correct") # env allows many steps agent = MockAgent(client=MockClient(text="wrong")) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) # Protocol max_steps=3, so we'll hit that before env's max_steps rollouts = await protocol.run(env=env, max_steps=3) @@ -533,7 +533,7 @@ async def test_single_agent_env_truncation_preserved(): # Env will truncate after 2 wrong answers env = MockEnv(max_steps=2, target="correct") agent = MockAgent(client=MockClient(text="wrong")) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) # Protocol allows many steps, but env will truncate at 2 rollouts = await protocol.run(env=env, max_steps=100) @@ -559,7 +559,7 @@ async def test_single_agent_normal_termination_not_truncated(): """ env = MockEnv(max_steps=10, target="win") agent = MockAgent(client=MockClient(text="win")) - protocol = SingleAgentSyncProtocol(agent=agent) + protocol = SingleAgentProtocol(agent=agent) rollouts = await protocol.run(env=env, max_steps=100) diff --git a/tests/test_public_api_imports.py b/tests/test_public_api_imports.py index fc5609c..35bbd51 100644 --- a/tests/test_public_api_imports.py +++ b/tests/test_public_api_imports.py @@ -8,7 +8,7 @@ def test_top_level_exports_import() -> None: from ludic.context import ContextStrategy, FullDialog, TruncatedThinkingContext # noqa: F401 from ludic.envs import LudicEnv, SingleAgentEnv, DatasetQAEnv # noqa: F401 from 
ludic.inference import VLLMChatClient, start_vllm_server, wait_for_vllm_health # noqa: F401 - from ludic.interaction import InteractionProtocol, SingleAgentSyncProtocol, MultiAgentProtocol, TraceCollector # noqa: F401 + from ludic.interaction import InteractionProtocol, SingleAgentProtocol, MultiAgentProtocol, TraceCollector # noqa: F401 from ludic.parsers import boxed_parser, xml_tag_parser, compose_parsers, think_prefix_parser # noqa: F401 from ludic.distributed import create_vllm_publisher # noqa: F401 from ludic.types import Rollout, Step # noqa: F401 diff --git a/tests/test_rollout_engine.py b/tests/test_rollout_engine.py index 20c17ed..85f6010 100644 --- a/tests/test_rollout_engine.py +++ b/tests/test_rollout_engine.py @@ -8,7 +8,7 @@ from ludic.agents.base_agent import Agent from ludic.inference.client import ChatResponse from ludic.interaction.base import InteractionProtocol -from ludic.interaction.single_agent import SingleAgentSyncProtocol +from ludic.interaction.single_agent import SingleAgentProtocol from ludic.context.full_dialog import FullDialog from ludic.envs.env import LudicEnv from ludic.inference.request import ChatCompletionRequest, InferenceSpec, ReturnSpec @@ -111,7 +111,7 @@ async def test_generate_rollouts_basic_metadata_and_termination( mock_agent, ) -> None: protocol_registry: ProtocolRegistry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=mock_agent) + "mock_protocol": lambda: SingleAgentProtocol(agent=mock_agent) } engine = RolloutEngine( @@ -203,7 +203,7 @@ async def test_generate_rollouts_unknown_env_raises( mock_agent, ) -> None: protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=mock_agent) + "mock_protocol": lambda: SingleAgentProtocol(agent=mock_agent) } engine = RolloutEngine( env_registry=env_registry, @@ -259,10 +259,10 @@ async def test_generate_rollouts_heterogeneous_protocols( """ # Define two different agent/protocol setups agent_A = MockAgent(client=MockClient(text="Agent A says 
hi")) - protocol_A = SingleAgentSyncProtocol(agent=agent_A) + protocol_A = SingleAgentProtocol(agent=agent_A) agent_B = MockAgent(client=MockClient(text="Agent B says hi")) - protocol_B = SingleAgentSyncProtocol(agent=agent_B) + protocol_B = SingleAgentProtocol(agent=agent_B) protocol_registry = { "protocol_A": lambda: protocol_A, @@ -317,7 +317,7 @@ async def test_generate_rollouts_writes_jsonl( jsonl_path = tmp_path / "rollouts.jsonl" protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=mock_agent) + "mock_protocol": lambda: SingleAgentProtocol(agent=mock_agent) } engine = RolloutEngine( @@ -373,7 +373,7 @@ async def test_generate_batch_uses_model_token_ids_when_available( ) protocol_registry = { - "token_protocol": lambda: SingleAgentSyncProtocol(agent=agent) + "token_protocol": lambda: SingleAgentProtocol(agent=agent) } engine = RolloutEngine( @@ -429,7 +429,7 @@ async def test_generate_batch_raises_if_no_token_ids_and_no_retokenize( mock_agent, ) -> None: protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=mock_agent) + "mock_protocol": lambda: SingleAgentProtocol(agent=mock_agent) } engine = RolloutEngine( @@ -471,7 +471,7 @@ async def test_rollout_batch_source_next_batch_integration( parser=_mock_parser, ) protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=agent) + "mock_protocol": lambda: SingleAgentProtocol(agent=agent) } engine = RolloutEngine( @@ -527,7 +527,7 @@ async def test_rollout_batch_source_passes_sample_filter( parser=_mock_parser, ) protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=agent) + "mock_protocol": lambda: SingleAgentProtocol(agent=agent) } engine = RolloutEngine( @@ -579,7 +579,7 @@ async def test_saw_item_contains_truncation_flags( parser=_mock_parser, ) # Never terminates the env since it never outputs target="win" protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=agent), + "mock_protocol": 
lambda: SingleAgentProtocol(agent=agent), } engine = RolloutEngine( @@ -634,7 +634,7 @@ async def test_generate_batch_applies_sample_filter_and_updates_counts( parser=_mock_parser, ) # Never terminates the env since it never outputs target="win" protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=agent), + "mock_protocol": lambda: SingleAgentProtocol(agent=agent), } engine = RolloutEngine( @@ -701,7 +701,7 @@ async def complete( # type: ignore[override] parser=_mock_parser, ) protocol_registry = { - "mock_protocol": lambda: SingleAgentSyncProtocol(agent=agent), + "mock_protocol": lambda: SingleAgentProtocol(agent=agent), } engine = RolloutEngine( diff --git a/uv.lock b/uv.lock index a49fea3..a17cec9 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,11 @@ version = 1 revision = 3 requires-python = "==3.12.*" +resolution-markers = [ + "sys_platform != 'darwin' and sys_platform != 'linux'", + "sys_platform == 'darwin'", + "sys_platform == 'linux'", +] [[package]] name = "accelerate" @@ -8,12 +13,15 @@ version = "1.12.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "huggingface-hub" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, { name = "packaging" }, { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.9.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 
'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4a/8e/ac2a9566747a93f8be36ee08532eb0160558b07630a081a6056a9f89bf1d/accelerate-1.12.0.tar.gz", hash = "sha256:70988c352feb481887077d2ab845125024b2a137a5090d6d7a32b57d03a45df6", size = 398399, upload-time = "2025-11-21T11:27:46.973Z" } wheels = [ @@ -78,11 +86,11 @@ wheels = [ [[package]] name = "annotated-doc" -version = "0.0.3" +version = "0.0.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/a6/dc46877b911e40c00d395771ea710d5e77b6de7bacd5fdcd78d70cc5a48f/annotated_doc-0.0.3.tar.gz", hash = "sha256:e18370014c70187422c33e945053ff4c286f453a984eba84d0dbfa0c935adeda", size = 5535, upload-time = "2025-10-24T14:57:10.718Z" } +sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/b7/cf592cb5de5cb3bade3357f8d2cf42bf103bbe39f459824b4939fd212911/annotated_doc-0.0.3-py3-none-any.whl", hash = "sha256:348ec6664a76f1fd3be81f43dffbee4c7e8ce931ba71ec67cc7f4ade7fbbb580", size = 5488, upload-time = "2025-10-24T14:57:09.462Z" }, + { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" }, ] [[package]] @@ -99,41 +107,31 @@ name = "anthropic" version = "0.71.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "anyio" }, - { name = "distro" }, - { name = "docstring-parser" }, - { name = "httpx" }, - { name = "jiter" }, - { name = "pydantic" }, - { name = "sniffio" }, - { name = "typing-extensions" 
}, + { name = "anyio", marker = "sys_platform == 'linux'" }, + { name = "distro", marker = "sys_platform == 'linux'" }, + { name = "docstring-parser", marker = "sys_platform == 'linux'" }, + { name = "httpx", marker = "sys_platform == 'linux'" }, + { name = "jiter", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "sniffio", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/82/4f/70682b068d897841f43223df82d96ec1d617435a8b759c4a2d901a50158b/anthropic-0.71.0.tar.gz", hash = "sha256:eb8e6fa86d049061b3ef26eb4cbae0174ebbff21affa6de7b3098da857d8de6a", size = 489102, upload-time = "2025-10-16T15:54:40.08Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/5d/77/073e8ac488f335aec7001952825275582fb8f433737e90f24eeef9d878f6/anthropic-0.71.0-py3-none-any.whl", hash = "sha256:85c5015fcdbdc728390f11b17642a65a4365d03b12b799b18b6cc57e71fdb327", size = 355035, upload-time = "2025-10-16T15:54:38.238Z" }, ] -[[package]] -name = "antlr4-python3-runtime" -version = "4.13.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/33/5f/2cdf6f7aca3b20d3f316e9f505292e1f256a32089bd702034c29ebde6242/antlr4_python3_runtime-4.13.2.tar.gz", hash = "sha256:909b647e1d2fc2b70180ac586df3933e38919c85f98ccc656a96cd3f25ef3916", size = 117467, upload-time = "2024-08-03T19:00:12.757Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/89/03/a851e84fcbb85214dc637b6378121ef9a0dd61b4c65264675d8a5c9b1ae7/antlr4_python3_runtime-4.13.2-py3-none-any.whl", hash = "sha256:fe3835eb8d33daece0e799090eda89719dbccee7aa39ef94eed3818cafa5a7e8", size = 144462, upload-time = "2024-08-03T19:00:11.134Z" }, -] - [[package]] name = "anyio" -version = "4.11.0" +version = "4.12.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "idna" }, - { 
name = "sniffio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c6/78/7d432127c41b50bccba979505f272c16cbcadcc33645d5fa3a738110ae75/anyio-4.11.0.tar.gz", hash = "sha256:82a8d0b81e318cc5ce71a5f1f8b5c4e63619620b63141ef8c995fa0db95a57c4", size = 219094, upload-time = "2025-09-23T09:19:12.58Z" } +sdist = { url = "https://files.pythonhosted.org/packages/16/ce/8a777047513153587e5434fd752e89334ac33e379aa3497db860eeb60377/anyio-4.12.0.tar.gz", hash = "sha256:73c693b567b0c55130c104d0b43a9baf3aa6a31fc6110116509f27bf75e21ec0", size = 228266, upload-time = "2025-11-28T23:37:38.911Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/b3/9b1a8074496371342ec1e796a96f99c82c945a339cd81a8e73de28b4cf9e/anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc", size = 109097, upload-time = "2025-09-23T09:19:10.601Z" }, + { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" }, ] [[package]] @@ -141,16 +139,14 @@ name = "apache-tvm-ffi" version = "0.1.6" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/45/20/8da071821b2142bdeed757d2859dede4817e0b82a96e9a4d8cfbffd49006/apache_tvm_ffi-0.1.6.tar.gz", hash = "sha256:53088126f7fce11823ddf0fb101e968a90298d79fd68829c0a981f25467a574c", size = 2387987, upload-time = "2025-12-16T19:00:33.523Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1f/de/4ae5dd4d493b1cea755a25d59088895486432c053cff5a3287b75e36ce54/apache_tvm_ffi-0.1.6-cp312-abi3-macosx_11_0_arm64.whl", hash = 
"sha256:5f4c0678854dbf3bfaa37795465f570d79c68759896b04b3d31774af0a03bcb8", size = 1779381, upload-time = "2025-12-16T18:59:59.593Z" }, { url = "https://files.pythonhosted.org/packages/2d/40/2e943cbda764c3266a6966a34e582d3f0ac6046ab6aaa756631df9afd7bf/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:653f1d4c8ffd6bca5300fd1825a81373a5be82f31dc79353d1c476fa31cf377a", size = 1936756, upload-time = "2025-12-16T19:00:00.844Z" }, { url = "https://files.pythonhosted.org/packages/a3/91/fc43f155b4d4363e61707655c1f4bee75af1d6dd4a76680f4956dd9846fe/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6a2cdfa90860a80e3cfb2364ce3b66a559fa5748de8d593a203b2e5992d92bc1", size = 2013641, upload-time = "2025-12-16T19:00:02.479Z" }, { url = "https://files.pythonhosted.org/packages/14/9b/45208f2a9c70a88fd8e65668c0628f3917625d64668800ff55a2390d7fe0/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223ac7ac08b34a6dbabe7085f23939b4aaa70666e72ddad41015659034e095af", size = 1881149, upload-time = "2025-12-16T19:00:03.776Z" }, { url = "https://files.pythonhosted.org/packages/7d/c5/e3ba08379127578bb3417605b61e9cd5e513184a6947ec7f3fac93d16355/apache_tvm_ffi-0.1.6-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05cedb3ba7600dc9ae35c17b7325d44ecf02c56c3ba1b62668dca8390da7ec28", size = 1992886, upload-time = "2025-12-16T19:00:05.047Z" }, - { url = "https://files.pythonhosted.org/packages/d6/7b/4df1e523ae4bcbfbe65a3e7ef3c8810cb76e9ae44fa9b44c9fac152ecc2b/apache_tvm_ffi-0.1.6-cp312-abi3-win_amd64.whl", hash = "sha256:a6c29ba9dbc6273f4534bfc0e8a52a784f264724eb62df62daedc2b349dabe85", size = 1758454, upload-time = "2025-12-16T19:00:06.498Z" }, ] [[package]] @@ -186,8 +182,6 @@ version = "1.0.8" source = { registry = "https://pypi.org/simple" } sdist = { url = 
"https://files.pythonhosted.org/packages/75/aa/abcd75e9600987a0bc6cfe9b6b2ff3f0e2cb08c170addc6e76035b5c4cb3/blake3-1.0.8.tar.gz", hash = "sha256:513cc7f0f5a7c035812604c2c852a0c1468311345573de647e310aca4ab165ba", size = 117308, upload-time = "2025-10-14T06:47:48.83Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ed/a0/b7b6dff04012cfd6e665c09ee446f749bd8ea161b00f730fe1bdecd0f033/blake3-1.0.8-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:d8da4233984d51471bd4e4366feda1d90d781e712e0a504ea54b1f2b3577557b", size = 347983, upload-time = "2025-10-14T06:45:47.214Z" }, - { url = "https://files.pythonhosted.org/packages/5b/a2/264091cac31d7ae913f1f296abc20b8da578b958ffb86100a7ce80e8bf5c/blake3-1.0.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1257be19f2d381c868a34cc822fc7f12f817ddc49681b6d1a2790bfbda1a9865", size = 325415, upload-time = "2025-10-14T06:45:48.482Z" }, { url = "https://files.pythonhosted.org/packages/ee/7d/85a4c0782f613de23d114a7a78fcce270f75b193b3ff3493a0de24ba104a/blake3-1.0.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:269f255b110840e52b6ce9db02217e39660ebad3e34ddd5bca8b8d378a77e4e1", size = 371296, upload-time = "2025-10-14T06:45:49.674Z" }, { url = "https://files.pythonhosted.org/packages/e3/20/488475254976ed93fab57c67aa80d3b40df77f7d9db6528c9274bff53e08/blake3-1.0.8-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:66ca28a673025c40db3eba21a9cac52f559f83637efa675b3f6bd8683f0415f3", size = 374516, upload-time = "2025-10-14T06:45:51.23Z" }, { url = "https://files.pythonhosted.org/packages/7b/21/2a1c47fedb77fb396512677ec6d46caf42ac6e9a897db77edd0a2a46f7bb/blake3-1.0.8-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcb04966537777af56c1f399b35525aa70a1225816e121ff95071c33c0f7abca", size = 447911, upload-time = "2025-10-14T06:45:52.637Z" }, @@ -196,17 +190,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/5b/94/eafaa5cdddadc0c9c603a6a6d8339433475e1a9f60c8bb9c2eed2d8736b6/blake3-1.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:504d1399b7fb91dfe5c25722d2807990493185faa1917456455480c36867adb5", size = 388001, upload-time = "2025-10-14T06:45:57.067Z" }, { url = "https://files.pythonhosted.org/packages/17/81/735fa00d13de7f68b25e1b9cb36ff08c6f165e688d85d8ec2cbfcdedccc5/blake3-1.0.8-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c84af132aa09abeadf9a0118c8fb26f4528f3f42c10ef8be0fcf31c478774ec4", size = 550302, upload-time = "2025-10-14T06:45:58.657Z" }, { url = "https://files.pythonhosted.org/packages/0e/c6/d1fe8bdea4a6088bd54b5a58bc40aed89a4e784cd796af7722a06f74bae7/blake3-1.0.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a25db3d36b55f5ed6a86470155cc749fc9c5b91c949b8d14f48658f9d960d9ec", size = 554211, upload-time = "2025-10-14T06:46:00.269Z" }, - { url = "https://files.pythonhosted.org/packages/55/d1/ca74aa450cbe10e396e061f26f7a043891ffa1485537d6b30d3757e20995/blake3-1.0.8-cp312-cp312-win32.whl", hash = "sha256:e0fee93d5adcd44378b008c147e84f181f23715307a64f7b3db432394bbfce8b", size = 228343, upload-time = "2025-10-14T06:46:01.533Z" }, - { url = "https://files.pythonhosted.org/packages/4d/42/bbd02647169e3fbed27558555653ac2578c6f17ccacf7d1956c58ef1d214/blake3-1.0.8-cp312-cp312-win_amd64.whl", hash = "sha256:6a6eafc29e4f478d365a87d2f25782a521870c8514bb43734ac85ae9be71caf7", size = 215704, upload-time = "2025-10-14T06:46:02.79Z" }, ] [[package]] name = "cachetools" -version = "6.2.1" +version = "6.2.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cc/7e/b975b5814bd36faf009faebe22c1072a1fa1168db34d285ef0ba071ad78c/cachetools-6.2.1.tar.gz", hash = "sha256:3f391e4bd8f8bf0931169baf7456cc822705f4e2a31f840d218f445b9a854201", size = 31325, upload-time = "2025-10-12T14:55:30.139Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/bc/1d/ede8680603f6016887c062a2cf4fc8fdba905866a3ab8831aa8aa651320c/cachetools-6.2.4.tar.gz", hash = "sha256:82c5c05585e70b6ba2d3ae09ea60b79548872185d2f24ae1f2709d37299fd607", size = 31731, upload-time = "2025-12-15T18:24:53.744Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/96/c5/1e741d26306c42e2bf6ab740b2202872727e0f606033c9dd713f8b93f5a8/cachetools-6.2.1-py3-none-any.whl", hash = "sha256:09868944b6dde876dfd44e1d47e18484541eaf12f26f29b7af91b26cc892d701", size = 11280, upload-time = "2025-10-12T14:55:28.382Z" }, + { url = "https://files.pythonhosted.org/packages/2c/fc/1d7b80d0eb7b714984ce40efc78859c022cd930e402f599d8ca9e39c78a4/cachetools-6.2.4-py3-none-any.whl", hash = "sha256:69a7a52634fed8b8bf6e24a050fb60bff1c9bd8f6d24572b99c32d4e71e62a51", size = 11551, upload-time = "2025-12-15T18:24:52.332Z" }, ] [[package]] @@ -215,24 +207,20 @@ version = "5.7.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/a2/b8/c0f6a7d46f816cb18b1fda61a2fe648abe16039f1ff93ea720a6e9fb3cee/cbor2-5.7.1.tar.gz", hash = "sha256:7a405a1d7c8230ee9acf240aad48ae947ef584e8af05f169f3c1bde8f01f8b71", size = 102467, upload-time = "2025-10-24T09:23:06.569Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/56/54/48426472f0c051982c647331441aed09b271a0500356ae0b7054c813d174/cbor2-5.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bd5ca44891c06f6b85d440836c967187dc1d30b15f86f315d55c675d3a841078", size = 69031, upload-time = "2025-10-24T09:22:25.438Z" }, - { url = "https://files.pythonhosted.org/packages/d3/68/1dd58c7706e9752188358223db58c83f3c48e07f728aa84221ffd244652f/cbor2-5.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:537d73ef930ccc1a7b6a2e8d2cbf81407d270deb18e40cda5eb511bd70f71078", size = 68825, upload-time = "2025-10-24T09:22:26.497Z" }, { url = 
"https://files.pythonhosted.org/packages/09/4e/380562fe9f9995a1875fb5ec26fd041e19d61f4630cb690a98c5195945fc/cbor2-5.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:edbf814dd7763b6eda27a5770199f6ccd55bd78be8f4367092460261bfbf19d0", size = 286222, upload-time = "2025-10-24T09:22:27.546Z" }, { url = "https://files.pythonhosted.org/packages/7c/bb/9eccdc1ea3c4d5c7cdb2e49b9de49534039616be5455ce69bd64c0b2efe2/cbor2-5.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fc81da8c0e09beb42923e455e477b36ff14a03b9ca18a8a2e9b462de9a953e8", size = 285688, upload-time = "2025-10-24T09:22:28.651Z" }, { url = "https://files.pythonhosted.org/packages/59/8c/4696d82f5bd04b3d45d9a64ec037fa242630c134e3218d6c252b4f59b909/cbor2-5.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e4a7d660d428911a3aadb7105e94438d7671ab977356fdf647a91aab751033bd", size = 277063, upload-time = "2025-10-24T09:22:29.775Z" }, { url = "https://files.pythonhosted.org/packages/95/50/6538e44ca970caaad2fa376b81701d073d84bf597aac07a59d0a253b1a7f/cbor2-5.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:228e0af9c0a9ddf6375b6ae010eaa1942a1901d403f134ac9ee6a76a322483f9", size = 278334, upload-time = "2025-10-24T09:22:30.904Z" }, - { url = "https://files.pythonhosted.org/packages/64/a9/156ccd2207fb26b5b61d23728b4dbdc595d1600125aa79683a4a8ddc9313/cbor2-5.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:2d08a6c0d9ed778448e185508d870f4160ba74f59bb17a966abd0d14d0ff4dd3", size = 68404, upload-time = "2025-10-24T09:22:32.108Z" }, - { url = "https://files.pythonhosted.org/packages/4f/49/adc53615e9dd32c4421f6935dfa2235013532c6e6b28ee515bbdd92618be/cbor2-5.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:752506cfe72da0f4014b468b30191470ee8919a64a0772bd3b36a4fccf5fcefc", size = 64047, upload-time = "2025-10-24T09:22:33.147Z" }, { url = 
"https://files.pythonhosted.org/packages/d5/7d/383bafeabb54c17fe5b6d5aca4e863e6b7df10bcc833b34aa169e9dfce1a/cbor2-5.7.1-py3-none-any.whl", hash = "sha256:68834e4eff2f56629ce6422b0634bc3f74c5a4269de5363f5265fe452c706ba7", size = 23829, upload-time = "2025-10-24T09:23:05.54Z" }, ] [[package]] name = "certifi" -version = "2025.10.5" +version = "2025.11.12" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4c/5b/b6ce21586237c77ce67d01dc5507039d444b630dd76611bbca2d8e5dcd91/certifi-2025.10.5.tar.gz", hash = "sha256:47c09d31ccf2acf0be3f701ea53595ee7e0b8fa08801c6624be771df09ae7b43", size = 164519, upload-time = "2025-10-05T04:12:15.808Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e4/37/af0d2ef3967ac0d6113837b44a4f0bfe1328c2b9763bd5b1744520e5cfed/certifi-2025.10.5-py3-none-any.whl", hash = "sha256:0f212c2744a9bb6de0c56639a6f68afe01ecd92d91f14ae897c4fe7bbeeef0de", size = 163286, upload-time = "2025-10-05T04:12:14.03Z" }, + { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" }, ] [[package]] @@ -240,12 +228,10 @@ name = "cffi" version = "2.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pycparser", marker = "implementation_name != 'PyPy'" }, + { name = "pycparser", marker = "implementation_name != 'PyPy' and sys_platform == 'linux'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" }, - { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" }, { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" }, { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" }, { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" }, @@ -253,9 +239,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" }, { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" }, { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" }, - { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" }, - { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" }, - { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" }, ] [[package]] @@ -285,14 +268,14 @@ wheels = [ [[package]] name = "click" -version = "8.2.1" +version = "8.3.1" source = { registry = "https://pypi.org/simple" } 
dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload-time = "2025-05-20T23:19:49.832Z" } +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" }, + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, ] [[package]] @@ -318,10 +301,10 @@ name = "compressed-tensors" version = "0.12.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "loguru" }, - { name = "pydantic" }, - { name = "torch" }, - { name = "transformers" }, + { name = "loguru", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "transformers", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a2/79/4c5c1cd14266f8cf2650bdb940f986ce7fcaeb56aad8cfa9e9afedf14e2f/compressed_tensors-0.12.2.tar.gz", hash = 
"sha256:5bb40856dd17f128ab73557ecc73799f80db4dd82fab6de875f1e6899b9ea0c4", size = 190409, upload-time = "2025-10-07T14:30:59.302Z" } wheels = [ @@ -333,11 +316,10 @@ name = "cryptography" version = "46.0.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, + { name = "cffi", marker = "platform_python_implementation != 'PyPy' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/9f/33/c00162f49c0e2fe8064a62cb92b93e50c74a72bc370ab92f86112b33ff62/cryptography-46.0.3.tar.gz", hash = "sha256:a8b17438104fed022ce745b362294d9ce35b4c2e45c1d958ad4a4b019285f4a1", size = 749258, upload-time = "2025-10-15T23:18:31.74Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/42/9c391dd801d6cf0d561b5890549d4b27bafcc53b39c31a817e69d87c625b/cryptography-46.0.3-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:109d4ddfadf17e8e7779c39f9b18111a09efb969a301a31e987416a0191ed93a", size = 7225004, upload-time = "2025-10-15T23:16:52.239Z" }, { url = "https://files.pythonhosted.org/packages/1c/67/38769ca6b65f07461eb200e85fc1639b438bdc667be02cf7f2cd6a64601c/cryptography-46.0.3-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:09859af8466b69bc3c27bdf4f5d84a665e0f7ab5088412e9e2ec49758eca5cbc", size = 4296667, upload-time = "2025-10-15T23:16:54.369Z" }, { url = "https://files.pythonhosted.org/packages/5c/49/498c86566a1d80e978b42f0d702795f69887005548c041636df6ae1ca64c/cryptography-46.0.3-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:01ca9ff2885f3acc98c29f1860552e37f6d7c7d013d7334ff2a9de43a449315d", size = 4450807, upload-time = "2025-10-15T23:16:56.414Z" }, { url = "https://files.pythonhosted.org/packages/4b/0a/863a3604112174c8624a2ac3c038662d9e59970c7f926acdcfaed8d61142/cryptography-46.0.3-cp311-abi3-manylinux_2_28_aarch64.whl", hash = 
"sha256:6eae65d4c3d33da080cff9c4ab1f711b15c1d9760809dad6ea763f3812d254cb", size = 4299615, upload-time = "2025-10-15T23:16:58.442Z" }, @@ -349,10 +331,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/29/798fc4ec461a1c9e9f735f2fc58741b0daae30688f41b2497dcbc9ed1355/cryptography-46.0.3-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:10b01676fc208c3e6feeb25a8b83d81767e8059e1fe86e1dc62d10a3018fa926", size = 4481596, upload-time = "2025-10-15T23:17:09.343Z" }, { url = "https://files.pythonhosted.org/packages/15/8d/03cd48b20a573adfff7652b76271078e3045b9f49387920e7f1f631d125e/cryptography-46.0.3-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0abf1ffd6e57c67e92af68330d05760b7b7efb243aab8377e583284dbab72c71", size = 4426782, upload-time = "2025-10-15T23:17:11.22Z" }, { url = "https://files.pythonhosted.org/packages/fa/b1/ebacbfe53317d55cf33165bda24c86523497a6881f339f9aae5c2e13e57b/cryptography-46.0.3-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a04bee9ab6a4da801eb9b51f1b708a1b5b5c9eb48c03f74198464c66f0d344ac", size = 4698381, upload-time = "2025-10-15T23:17:12.829Z" }, - { url = "https://files.pythonhosted.org/packages/96/92/8a6a9525893325fc057a01f654d7efc2c64b9de90413adcf605a85744ff4/cryptography-46.0.3-cp311-abi3-win32.whl", hash = "sha256:f260d0d41e9b4da1ed1e0f1ce571f97fe370b152ab18778e9e8f67d6af432018", size = 3055988, upload-time = "2025-10-15T23:17:14.65Z" }, - { url = "https://files.pythonhosted.org/packages/7e/bf/80fbf45253ea585a1e492a6a17efcb93467701fa79e71550a430c5e60df0/cryptography-46.0.3-cp311-abi3-win_amd64.whl", hash = "sha256:a9a3008438615669153eb86b26b61e09993921ebdd75385ddd748702c5adfddb", size = 3514451, upload-time = "2025-10-15T23:17:16.142Z" }, - { url = "https://files.pythonhosted.org/packages/2e/af/9b302da4c87b0beb9db4e756386a7c6c5b8003cd0e742277888d352ae91d/cryptography-46.0.3-cp311-abi3-win_arm64.whl", hash = "sha256:5d7f93296ee28f68447397bf5198428c9aeeab45705a55d53a6343455dcb2c3c", size = 2928007, upload-time 
= "2025-10-15T23:17:18.04Z" }, - { url = "https://files.pythonhosted.org/packages/fd/23/45fe7f376a7df8daf6da3556603b36f53475a99ce4faacb6ba2cf3d82021/cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:cb3d760a6117f621261d662bccc8ef5bc32ca673e037c83fbe565324f5c46936", size = 7218248, upload-time = "2025-10-15T23:17:46.294Z" }, { url = "https://files.pythonhosted.org/packages/27/32/b68d27471372737054cbd34c84981f9edbc24fe67ca225d389799614e27f/cryptography-46.0.3-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4b7387121ac7d15e550f5cb4a43aef2559ed759c35df7336c402bb8275ac9683", size = 4294089, upload-time = "2025-10-15T23:17:48.269Z" }, { url = "https://files.pythonhosted.org/packages/26/42/fa8389d4478368743e24e61eea78846a0006caffaf72ea24a15159215a14/cryptography-46.0.3-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:15ab9b093e8f09daab0f2159bb7e47532596075139dd74365da52ecc9cb46c5d", size = 4440029, upload-time = "2025-10-15T23:17:49.837Z" }, { url = "https://files.pythonhosted.org/packages/5f/eb/f483db0ec5ac040824f269e93dd2bd8a21ecd1027e77ad7bdf6914f2fd80/cryptography-46.0.3-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:46acf53b40ea38f9c6c229599a4a13f0d46a6c3fa9ef19fc1a124d62e338dfa0", size = 4297222, upload-time = "2025-10-15T23:17:51.357Z" }, @@ -364,9 +342,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/59/873633f3f2dcd8a053b8dd1d38f783043b5fce589c0f6988bf55ef57e43e/cryptography-46.0.3-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:402b58fc32614f00980b66d6e56a5b4118e6cb362ae8f3fda141ba4689bd4506", size = 4472642, upload-time = "2025-10-15T23:18:02.749Z" }, { url = "https://files.pythonhosted.org/packages/3d/39/8e71f3930e40f6877737d6f69248cf74d4e34b886a3967d32f919cc50d3b/cryptography-46.0.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ef639cb3372f69ec44915fafcd6698b6cc78fbe0c2ea41be867f6ed612811963", size = 4423126, upload-time = 
"2025-10-15T23:18:04.85Z" }, { url = "https://files.pythonhosted.org/packages/cd/c7/f65027c2810e14c3e7268353b1681932b87e5a48e65505d8cc17c99e36ae/cryptography-46.0.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3b51b8ca4f1c6453d8829e1eb7299499ca7f313900dd4d89a24b8b87c0a780d4", size = 4686573, upload-time = "2025-10-15T23:18:06.908Z" }, - { url = "https://files.pythonhosted.org/packages/0a/6e/1c8331ddf91ca4730ab3086a0f1be19c65510a33b5a441cb334e7a2d2560/cryptography-46.0.3-cp38-abi3-win32.whl", hash = "sha256:6276eb85ef938dc035d59b87c8a7dc559a232f954962520137529d77b18ff1df", size = 3036695, upload-time = "2025-10-15T23:18:08.672Z" }, - { url = "https://files.pythonhosted.org/packages/90/45/b0d691df20633eff80955a0fc7695ff9051ffce8b69741444bd9ed7bd0db/cryptography-46.0.3-cp38-abi3-win_amd64.whl", hash = "sha256:416260257577718c05135c55958b674000baef9a1c7d9e8f306ec60d71db850f", size = 3501720, upload-time = "2025-10-15T23:18:10.632Z" }, - { url = "https://files.pythonhosted.org/packages/e8/cb/2da4cc83f5edb9c3257d09e1e7ab7b23f049c7962cae8d842bbef0a9cec9/cryptography-46.0.3-cp38-abi3-win_arm64.whl", hash = "sha256:d89c3468de4cdc4f08a57e214384d0471911a3830fcdaf7a8cc587e42a866372", size = 2918740, upload-time = "2025-10-15T23:18:12.277Z" }, ] [[package]] @@ -374,12 +349,11 @@ name = "cuda-bindings" version = "13.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cuda-pathfinder" }, + { name = "cuda-pathfinder", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/53/3d/c8ed9d169843091f3f0d6b8218e826fd59520a37e0434c204feada597988/cuda_bindings-13.1.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e75ad0cb863330df784236d289612d71ca855c013d19ae00e5693574abd6915", size = 15530160, upload-time = "2025-12-09T22:05:55.386Z" }, { url = 
"https://files.pythonhosted.org/packages/4a/8e/368295623ee43fba622909d780fbb6863efc1638dff55f67a0f04eac6470/cuda_bindings-13.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:25785d1a3cdcd98f151240fd5efd025609319a6720a217dee2a929241749d488", size = 16110386, upload-time = "2025-12-09T22:05:57.71Z" }, - { url = "https://files.pythonhosted.org/packages/60/1f/ecc4701ade3e85f091c625a920574527b9daf7fb354189fbfbc5516af6cd/cuda_bindings-13.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:ccde9c95c0e953b31fe7731bb08da9d0a34b1770498df9a3c156fdfdbe3951ad", size = 15250028, upload-time = "2025-12-09T22:06:00.346Z" }, ] [[package]] @@ -395,8 +369,8 @@ name = "cuda-python" version = "13.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cuda-bindings" }, - { name = "cuda-pathfinder" }, + { name = "cuda-bindings", marker = "sys_platform == 'linux'" }, + { name = "cuda-pathfinder", marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/cd/08/b5e3b9822662d72d540d830531e3ab6a7cabbda3dd56175696aabccfeb76/cuda_python-13.1.1-py3-none-any.whl", hash = "sha256:944cc4fe6482673d28dd545797a28840945a1668739328fa2ad1e9be4f7050d9", size = 8038, upload-time = "2025-12-09T22:13:10.719Z" }, @@ -407,18 +381,17 @@ name = "cupy-cuda12x" version = "13.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "fastrlock" }, - { name = "numpy" }, + { name = "fastrlock", marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/12/c5/7e7fc4816d0de0154e5d9053242c3a08a0ca8b43ee656a6f7b3b95055a7b/cupy_cuda12x-13.6.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:a6970ceefe40f9acbede41d7fe17416bd277b1bd2093adcde457b23b578c5a59", size = 127334633, upload-time = "2025-08-18T08:24:43.065Z" }, { url = 
"https://files.pythonhosted.org/packages/e0/95/d7e1295141e7d530674a3cc567e13ed0eb6b81524cb122d797ed996b5bea/cupy_cuda12x-13.6.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:79b0cacb5e8b190ef409f9e03f06ac8de1b021b0c0dda47674d446f5557e0eb1", size = 112886268, upload-time = "2025-08-18T08:24:49.294Z" }, - { url = "https://files.pythonhosted.org/packages/ae/8c/14555b63fd78cfac7b88af0094cea0a3cb845d243661ec7da69f7b3ea0de/cupy_cuda12x-13.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca06fede7b8b83ca9ad80062544ef2e5bb8d4762d1c4fc3ac8349376de9c8a5e", size = 89785108, upload-time = "2025-08-18T08:24:54.527Z" }, ] [[package]] name = "datasets" -version = "4.4.1" +version = "4.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dill" }, @@ -427,7 +400,8 @@ dependencies = [ { name = "httpx" }, { name = "huggingface-hub" }, { name = "multiprocess" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, { name = "packaging" }, { name = "pandas" }, { name = "pyarrow" }, @@ -436,9 +410,9 @@ dependencies = [ { name = "tqdm" }, { name = "xxhash" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/93/bf/0dae295d6d1ba0b1a200a9dd216838464b5bbd05da01407cb1330b377445/datasets-4.4.1.tar.gz", hash = "sha256:80322699aa8c0bbbdb7caa87906da689c3c2e29523cff698775c67f28fdab1fc", size = 585341, upload-time = "2025-11-05T16:00:38.162Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c4/54/9359803da96bc65439a28fbb014dc2c90b7d4d8034a93b72362b0d40191f/datasets-4.4.2.tar.gz", hash = "sha256:9de16e415c4ba4713eac0493f7c7dc74f3aa21599297f00cc6ddab409cb7b24b", size = 586474, upload-time = "2025-12-19T15:03:09.129Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/3b/5e/6f8d874366788ad5d549e9ba258037d974dda6e004843be1bda794571701/datasets-4.4.1-py3-none-any.whl", hash = "sha256:c1163de5211e42546079ab355cc0250c7e6db16eb209ac5ac6252f801f596c44", size = 511591, upload-time = "2025-11-05T16:00:36.365Z" }, + { url = "https://files.pythonhosted.org/packages/7b/b5/fefa518c809de7bced5cddb7c21c010da66fa2ae494bda96844a280cc6ce/datasets-4.4.2-py3-none-any.whl", hash = "sha256:6f5ef3417504d9cd663c71c1b90b9a494ff4c2076a2cd6a6e40ceee6ad95befc", size = 512268, upload-time = "2025-12-19T15:03:07.087Z" }, ] [[package]] @@ -446,8 +420,8 @@ name = "depyf" version = "0.20.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "astor" }, - { name = "dill" }, + { name = "astor", marker = "sys_platform == 'linux'" }, + { name = "dill", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/88/35/83fb0178212279aa0af031031905804c6de5618435d229f41ed21bb9ad2c/depyf-0.20.0.tar.gz", hash = "sha256:fb7683bd72c44f67b56029df2c47721e9a02ffa4d7b19095f1c54c4ebf797a98", size = 6168761, upload-time = "2025-10-13T12:33:38.589Z" } wheels = [ @@ -490,6 +464,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094, upload-time = "2025-09-07T18:57:58.071Z" }, ] +[[package]] +name = "docker" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/9b/4a2ea29aeba62471211598dac5d96825bb49348fa07e906ea930394a83ce/docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c", size = 117834, upload-time = 
"2024-05-23T11:13:57.216Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, +] + [[package]] name = "docstring-parser" version = "0.17.0" @@ -513,8 +501,8 @@ name = "email-validator" version = "2.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "dnspython" }, - { name = "idna" }, + { name = "dnspython", marker = "sys_platform == 'linux'" }, + { name = "idna", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f5/22/900cb125c76b7aaa450ce02fd727f452243f2e91a61af068b40adba60ea9/email_validator-2.3.0.tar.gz", hash = "sha256:9fc05c37f2f6cf439ff414f8fc46d917929974a82244c20eb10231ba60c54426", size = 51238, upload-time = "2025-08-26T13:09:06.831Z" } wheels = [ @@ -523,65 +511,86 @@ wheels = [ [[package]] name = "fastapi" -version = "0.121.1" +version = "0.128.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "annotated-doc" }, - { name = "pydantic" }, - { name = "starlette" }, - { name = "typing-extensions" }, + { name = "annotated-doc", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "starlette", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6b/a4/29e1b861fc9017488ed02ff1052feffa40940cb355ed632a8845df84ce84/fastapi-0.121.1.tar.gz", hash = "sha256:b6dba0538fd15dab6fe4d3e5493c3957d8a9e1e9257f56446b5859af66f32441", size = 342523, upload-time = "2025-11-08T21:48:14.068Z" } +sdist = { url = "https://files.pythonhosted.org/packages/52/08/8c8508db6c7b9aae8f7175046af41baad690771c9bcde676419965e338c7/fastapi-0.128.0.tar.gz", hash = 
"sha256:1cc179e1cef10a6be60ffe429f79b829dce99d8de32d7acb7e6c8dfdf7f2645a", size = 365682, upload-time = "2025-12-27T15:21:13.714Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/94/fd/2e6f7d706899cc08690c5f6641e2ffbfffe019e8f16ce77104caa5730910/fastapi-0.121.1-py3-none-any.whl", hash = "sha256:2c5c7028bc3a58d8f5f09aecd3fd88a000ccc0c5ad627693264181a3c33aa1fc", size = 109192, upload-time = "2025-11-08T21:48:12.458Z" }, + { url = "https://files.pythonhosted.org/packages/5c/05/5cbb59154b093548acd0f4c7c474a118eda06da25aa75c616b72d8fcd92a/fastapi-0.128.0-py3-none-any.whl", hash = "sha256:aebd93f9716ee3b4f4fcfe13ffb7cf308d99c9f3ab5622d8877441072561582d", size = 103094, upload-time = "2025-12-27T15:21:12.154Z" }, ] [package.optional-dependencies] standard = [ - { name = "email-validator" }, - { name = "fastapi-cli", extra = ["standard"] }, - { name = "httpx" }, - { name = "jinja2" }, - { name = "python-multipart" }, - { name = "uvicorn", extra = ["standard"] }, + { name = "email-validator", marker = "sys_platform == 'linux'" }, + { name = "fastapi-cli", extra = ["standard"], marker = "sys_platform == 'linux'" }, + { name = "httpx", marker = "sys_platform == 'linux'" }, + { name = "jinja2", marker = "sys_platform == 'linux'" }, + { name = "pydantic-extra-types", marker = "sys_platform == 'linux'" }, + { name = "pydantic-settings", marker = "sys_platform == 'linux'" }, + { name = "python-multipart", marker = "sys_platform == 'linux'" }, + { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" }, ] [[package]] name = "fastapi-cli" -version = "0.0.14" +version = "0.0.20" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "rich-toolkit" }, - { name = "typer" }, - { name = "uvicorn", extra = ["standard"] }, + { name = "rich-toolkit", marker = "sys_platform == 'linux'" }, + { name = "typer", marker = "sys_platform == 'linux'" }, + { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" }, ] 
-sdist = { url = "https://files.pythonhosted.org/packages/cc/13/11e43d630be84e51ba5510a6da6a11eb93b44b72caa796137c5dddda937b/fastapi_cli-0.0.14.tar.gz", hash = "sha256:ddfb5de0a67f77a8b3271af1460489bd4d7f4add73d11fbfac613827b0275274", size = 17994, upload-time = "2025-10-20T16:33:21.054Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/ca/d90fb3bfbcbd6e56c77afd9d114dd6ce8955d8bb90094399d1c70e659e40/fastapi_cli-0.0.20.tar.gz", hash = "sha256:d17c2634f7b96b6b560bc16b0035ed047d523c912011395f49f00a421692bc3a", size = 19786, upload-time = "2025-12-22T17:13:33.794Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/40/e8/bc8bbfd93dcc8e347ce98a3e654fb0d2e5f2739afb46b98f41a30c339269/fastapi_cli-0.0.14-py3-none-any.whl", hash = "sha256:e66b9ad499ee77a4e6007545cde6de1459b7f21df199d7f29aad2adaab168eca", size = 11151, upload-time = "2025-10-20T16:33:19.318Z" }, + { url = "https://files.pythonhosted.org/packages/08/89/5c4eef60524d0fd704eb0706885b82cd5623a43396b94e4a5b17d3a3f516/fastapi_cli-0.0.20-py3-none-any.whl", hash = "sha256:e58b6a0038c0b1532b7a0af690656093dee666201b6b19d3c87175b358e9f783", size = 12390, upload-time = "2025-12-22T17:13:31.708Z" }, ] [package.optional-dependencies] standard = [ - { name = "fastapi-cloud-cli" }, - { name = "uvicorn", extra = ["standard"] }, + { name = "fastapi-cloud-cli", marker = "sys_platform == 'linux'" }, + { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" }, ] [[package]] name = "fastapi-cloud-cli" -version = "0.3.1" +version = "0.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "httpx" }, - { name = "pydantic", extra = ["email"] }, - { name = "rich-toolkit" }, - { name = "rignore" }, - { name = "sentry-sdk" }, - { name = "typer" }, - { name = "uvicorn", extra = ["standard"] }, + { name = "fastar", marker = "sys_platform == 'linux'" }, + { name = "httpx", marker = "sys_platform == 'linux'" }, + { name = "pydantic", extra = ["email"], marker = 
"sys_platform == 'linux'" }, + { name = "rich-toolkit", marker = "sys_platform == 'linux'" }, + { name = "rignore", marker = "sys_platform == 'linux'" }, + { name = "sentry-sdk", marker = "sys_platform == 'linux'" }, + { name = "typer", marker = "sys_platform == 'linux'" }, + { name = "uvicorn", extra = ["standard"], marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f9/48/0f14d8555b750dc8c04382804e4214f1d7f55298127f3a0237ba566e69dd/fastapi_cloud_cli-0.3.1.tar.gz", hash = "sha256:8c7226c36e92e92d0c89827e8f56dbf164ab2de4444bd33aa26b6c3f7675db69", size = 24080, upload-time = "2025-10-09T11:32:58.174Z" } +sdist = { url = "https://files.pythonhosted.org/packages/51/5d/3b33438de35521fab4968b232caa9a4bd568a5078f2b2dfb7bb8a4528603/fastapi_cloud_cli-0.8.0.tar.gz", hash = "sha256:cf07c502528bfd9e6b184776659f05d9212811d76bbec9fbb6bf34bed4c7456f", size = 30257, upload-time = "2025-12-23T12:08:33.904Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/8e/abb95ef59e91bb5adaa2d18fbf9ea70fd524010bb03f406a2dd2a4775ef9/fastapi_cloud_cli-0.8.0-py3-none-any.whl", hash = "sha256:e9f40bee671d985fd25d7a5409b56d4f103777bf8a0c6d746ea5fbf97a8186d9", size = 22306, upload-time = "2025-12-23T12:08:32.68Z" }, +] + +[[package]] +name = "fastar" +version = "0.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/e7/f89d54fb04104114dd0552836dc2b47914f416cc0e200b409dd04a33de5e/fastar-0.8.0.tar.gz", hash = "sha256:f4d4d68dbf1c4c2808f0e730fac5843493fc849f70fe3ad3af60dfbaf68b9a12", size = 68524, upload-time = "2025-11-26T02:36:00.72Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/68/79/7f5a5e5513e6a737e5fb089d9c59c74d4d24dc24d581d3aa519b326bedda/fastapi_cloud_cli-0.3.1-py3-none-any.whl", hash = "sha256:7d1a98a77791a9d0757886b2ffbf11bcc6b3be93210dd15064be10b216bf7e00", size = 19711, upload-time = "2025-10-09T11:32:57.118Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/7e/1ae005addc789924a9268da2394d3bb5c6f96836f7e37b7e3d23c2362675/fastar-0.8.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9d210da2de733ca801de83e931012349d209f38b92d9630ccaa94bd445bdc9b8", size = 868938, upload-time = "2025-11-26T02:33:51.119Z" }, + { url = "https://files.pythonhosted.org/packages/a6/77/290a892b073b84bf82e6b2259708dfe79c54f356e252c2dd40180b16fe07/fastar-0.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa02270721517078a5bd61a38719070ac2537a4aa6b6c48cf369cf2abc59174a", size = 765204, upload-time = "2025-11-26T02:32:47.02Z" }, + { url = "https://files.pythonhosted.org/packages/d0/00/c3155171b976003af3281f5258189f1935b15d1221bfc7467b478c631216/fastar-0.8.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83c391e5b789a720e4d0029b9559f5d6dee3226693c5b39c0eab8eaece997e0f", size = 764717, upload-time = "2025-11-26T02:33:02.453Z" }, + { url = "https://files.pythonhosted.org/packages/b7/43/405b7ad76207b2c11b7b59335b70eac19e4a2653977f5588a1ac8fed54f4/fastar-0.8.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3258d7a78a72793cdd081545da61cabe85b1f37634a1d0b97ffee0ff11d105ef", size = 931502, upload-time = "2025-11-26T02:33:18.619Z" }, + { url = "https://files.pythonhosted.org/packages/da/8a/a3dde6d37cc3da4453f2845cdf16675b5686b73b164f37e2cc579b057c2c/fastar-0.8.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6eab95dd985cdb6a50666cbeb9e4814676e59cfe52039c880b69d67cfd44767", size = 821454, upload-time = "2025-11-26T02:33:33.427Z" }, + { url = "https://files.pythonhosted.org/packages/da/c1/904fe2468609c8990dce9fe654df3fbc7324a8d8e80d8240ae2c89757064/fastar-0.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:829b1854166141860887273c116c94e31357213fa8e9fe8baeb18bd6c38aa8d9", size = 821647, upload-time = "2025-11-26T02:34:07Z" }, + { url = 
"https://files.pythonhosted.org/packages/c8/73/a0642ab7a400bc07528091785e868ace598fde06fcd139b8f865ec1b6f3c/fastar-0.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b1667eae13f9457a3c737f4376d68e8c3e548353538b28f7e4273a30cb3965cd", size = 986342, upload-time = "2025-11-26T02:34:53.371Z" }, + { url = "https://files.pythonhosted.org/packages/af/af/60c1bfa6edab72366461a95f053d0f5f7ab1825fe65ca2ca367432cd8629/fastar-0.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b864a95229a7db0814cd9ef7987cb713fd43dce1b0d809dd17d9cd6f02fdde3e", size = 1040207, upload-time = "2025-11-26T02:35:10.65Z" }, + { url = "https://files.pythonhosted.org/packages/f6/a0/0d624290dec622e7fa084b6881f456809f68777d54a314f5dde932714506/fastar-0.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c05fbc5618ce17675a42576fa49858d79734627f0a0c74c0875ab45ee8de340c", size = 1045031, upload-time = "2025-11-26T02:35:28.108Z" }, + { url = "https://files.pythonhosted.org/packages/a7/74/cf663af53c4706ba88e6b4af44a6b0c3bd7d7ca09f079dc40647a8f06585/fastar-0.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7f41c51ee96f338662ee3c3df4840511ba3f9969606840f1b10b7cb633a3c716", size = 994877, upload-time = "2025-11-26T02:35:45.797Z" }, ] [[package]] @@ -594,36 +603,45 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/80/07/cdecb7aa976f34328372f1c4efd6c9dc1b039b3cc8d3f38787d640009a25/fastrlock-0.8.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5f13ec08f1adb1aa916c384b05ecb7dbebb8df9ea81abd045f60941c6283a670", size = 53924, upload-time = "2024-12-17T11:02:20.85Z" }, { url = "https://files.pythonhosted.org/packages/88/6d/59c497f8db9a125066dd3a7442fab6aecbe90d6fec344c54645eaf311666/fastrlock-0.8.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0ea4e53a04980d646def0f5e4b5e8bd8c7884288464acab0b37ca0c65c482bfe", size = 52140, upload-time = "2024-12-17T11:02:22.263Z" }, { url = 
"https://files.pythonhosted.org/packages/62/04/9138943c2ee803d62a48a3c17b69de2f6fa27677a6896c300369e839a550/fastrlock-0.8.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:38340f6635bd4ee2a4fb02a3a725759fe921f2ca846cb9ca44531ba739cc17b4", size = 53261, upload-time = "2024-12-17T11:02:24.418Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4b/db35a52589764c7745a613b6943bbd018f128d42177ab92ee7dde88444f6/fastrlock-0.8.3-cp312-cp312-win_amd64.whl", hash = "sha256:da06d43e1625e2ffddd303edcd6d2cd068e1c486f5fd0102b3f079c44eb13e2c", size = 31235, upload-time = "2024-12-17T11:02:25.708Z" }, ] [[package]] name = "filelock" -version = "3.20.0" +version = "3.20.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/58/46/0028a82567109b5ef6e4d2a1f04a583fb513e6cf9527fcdd09afd817deeb/filelock-3.20.0.tar.gz", hash = "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4", size = 18922, upload-time = "2025-10-08T18:03:50.056Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a7/23/ce7a1126827cedeb958fc043d61745754464eb56c5937c35bbf2b8e26f34/filelock-3.20.1.tar.gz", hash = "sha256:b8360948b351b80f420878d8516519a2204b07aefcdcfd24912a5d33127f188c", size = 19476, upload-time = "2025-12-15T23:54:28.027Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" }, + { url = "https://files.pythonhosted.org/packages/e3/7f/a1a97644e39e7316d850784c642093c99df1290a460df4ede27659056834/filelock-3.20.1-py3-none-any.whl", hash = "sha256:15d9e9a67306188a44baa72f569d2bfd803076269365fdea0934385da4dc361a", size = 16666, upload-time = "2025-12-15T23:54:26.874Z" }, +] + +[[package]] +name = "flash-attn" +version = "2.8.3" +source = { registry = 
"https://pypi.org/simple" } +dependencies = [ + { name = "einops", marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/3b/b2/8d76c41ad7974ee264754709c22963447f7f8134613fd9ce80984ed0dab7/flash_attn-2.8.3.tar.gz", hash = "sha256:1e71dd64a9e0280e0447b8a0c2541bad4bf6ac65bdeaa2f90e51a9e57de0370d", size = 8447812, upload-time = "2025-08-15T08:28:12.911Z" } [[package]] name = "flashinfer-python" version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "apache-tvm-ffi" }, - { name = "click" }, - { name = "einops" }, - { name = "ninja" }, - { name = "numpy" }, - { name = "nvidia-cudnn-frontend" }, - { name = "nvidia-cutlass-dsl" }, - { name = "nvidia-ml-py" }, - { name = "packaging" }, - { name = "requests" }, - { name = "tabulate" }, - { name = "torch" }, - { name = "tqdm" }, + { name = "apache-tvm-ffi", marker = "sys_platform == 'linux'" }, + { name = "click", marker = "sys_platform == 'linux'" }, + { name = "einops", marker = "sys_platform == 'linux'" }, + { name = "ninja", marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-frontend", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cutlass-dsl", marker = "sys_platform == 'linux'" }, + { name = "nvidia-ml-py", marker = "sys_platform == 'linux'" }, + { name = "packaging", marker = "sys_platform == 'linux'" }, + { name = "requests", marker = "sys_platform == 'linux'" }, + { name = "tabulate", marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "tqdm", marker = "sys_platform == 'linux'" }, ] sdist = { url 
= "https://files.pythonhosted.org/packages/b4/91/cca69baeff24bb3efd12c7479a026432c8717ee47193694010494c528b22/flashinfer_python-0.5.3.tar.gz", hash = "sha256:100d59b0ede47878d2808cd3a1b9039d7a952d66338bc9f68dac192ae1b2e3f1", size = 4682367, upload-time = "2025-11-20T21:22:46.976Z" } wheels = [ @@ -674,15 +692,39 @@ name = "gguf" version = "0.17.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, - { name = "pyyaml" }, - { name = "tqdm" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "pyyaml", marker = "sys_platform == 'linux'" }, + { name = "tqdm", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/08/08/7de1ca4b71e7bf33b547f82bb22505e221b5fa42f67d635e200e0ad22ad6/gguf-0.17.1.tar.gz", hash = "sha256:36ad71aad900a3e75fc94ebe96ea6029f03a4e44be7627ef7ad3d03e8c7bcb53", size = 89338, upload-time = "2025-06-19T14:00:33.705Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fc/31/6a93a887617ee7deeaa602ca3d02d1c12a6cb8a742a695de5d128f5fa46a/gguf-0.17.1-py3-none-any.whl", hash = "sha256:7bc5aa7eeb1931f7d39b48fdc5b38fda6b294b9dca75cf607ac69557840a3943", size = 96224, upload-time = "2025-06-19T14:00:32.88Z" }, ] +[[package]] +name = "gitdb" +version = "4.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "smmap" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/94/63b0fc47eb32792c7ba1fe1b694daec9a63620db1e313033d18140c2320a/gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571", size = 394684, upload-time = "2025-01-02T07:20:46.413Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/61/5c78b91c3143ed5c14207f463aecfc8f9dbb5092fb2869baf37c273b2705/gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf", size = 62794, 
upload-time = "2025-01-02T07:20:43.624Z" }, +] + +[[package]] +name = "gitpython" +version = "3.1.45" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "gitdb" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" }, +] + [[package]] name = "h11" version = "0.16.0" @@ -726,13 +768,10 @@ version = "0.7.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/b5/46/120a669232c7bdedb9d52d4aeae7e6c7dfe151e99dc70802e2fc7a5e1993/httptools-0.7.1.tar.gz", hash = "sha256:abd72556974f8e7c74a259655924a717a2365b236c882c3f6f8a45fe94703ac9", size = 258961, upload-time = "2025-10-10T03:55:08.559Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/7f/403e5d787dc4942316e515e949b0c8a013d84078a915910e9f391ba9b3ed/httptools-0.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:38e0c83a2ea9746ebbd643bdfb521b9aa4a91703e2cd705c20443405d2fd16a5", size = 206280, upload-time = "2025-10-10T03:54:39.274Z" }, - { url = "https://files.pythonhosted.org/packages/2a/0d/7f3fd28e2ce311ccc998c388dd1c53b18120fda3b70ebb022b135dc9839b/httptools-0.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f25bbaf1235e27704f1a7b86cd3304eabc04f569c828101d94a0e605ef7205a5", size = 110004, upload-time = "2025-10-10T03:54:40.403Z" }, { url = 
"https://files.pythonhosted.org/packages/84/a6/b3965e1e146ef5762870bbe76117876ceba51a201e18cc31f5703e454596/httptools-0.7.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2c15f37ef679ab9ecc06bfc4e6e8628c32a8e4b305459de7cf6785acd57e4d03", size = 517655, upload-time = "2025-10-10T03:54:41.347Z" }, { url = "https://files.pythonhosted.org/packages/11/7d/71fee6f1844e6fa378f2eddde6c3e41ce3a1fb4b2d81118dd544e3441ec0/httptools-0.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7fe6e96090df46b36ccfaf746f03034e5ab723162bc51b0a4cf58305324036f2", size = 511440, upload-time = "2025-10-10T03:54:42.452Z" }, { url = "https://files.pythonhosted.org/packages/22/a5/079d216712a4f3ffa24af4a0381b108aa9c45b7a5cc6eb141f81726b1823/httptools-0.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f72fdbae2dbc6e68b8239defb48e6a5937b12218e6ffc2c7846cc37befa84362", size = 495186, upload-time = "2025-10-10T03:54:43.937Z" }, { url = "https://files.pythonhosted.org/packages/e9/9e/025ad7b65278745dee3bd0ebf9314934c4592560878308a6121f7f812084/httptools-0.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e99c7b90a29fd82fea9ef57943d501a16f3404d7b9ee81799d41639bdaae412c", size = 499192, upload-time = "2025-10-10T03:54:45.003Z" }, - { url = "https://files.pythonhosted.org/packages/6d/de/40a8f202b987d43afc4d54689600ff03ce65680ede2f31df348d7f368b8f/httptools-0.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:3e14f530fefa7499334a79b0cf7e7cd2992870eb893526fb097d51b4f2d0f321", size = 86694, upload-time = "2025-10-10T03:54:45.923Z" }, ] [[package]] @@ -793,17 +832,12 @@ version = "3.4.0.post0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/2d/30/7ab4b9e88e7946f6beef419f74edcc541df3ea562c7882257b4eaa82417d/ijson-3.4.0.post0.tar.gz", hash = "sha256:9aa02dc70bb245670a6ca7fba737b992aeeb4895360980622f7e568dbf23e41e", size = 67216, upload-time 
= "2025-10-10T05:29:25.62Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7d/fe/3b6af0025288e769dbfa30485dae1b3bd3f33f00390f3ee532cbb1c33e9b/ijson-3.4.0.post0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b607a500fca26101be47d2baf7cddb457b819ab60a75ce51ed1092a40da8b2f9", size = 87847, upload-time = "2025-10-10T05:28:07.229Z" }, - { url = "https://files.pythonhosted.org/packages/6e/a5/95ee2ca82f3b1a57892452f6e5087607d56c620beb8ce625475194568698/ijson-3.4.0.post0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4827d9874a6a81625412c59f7ca979a84d01f7f6bfb3c6d4dc4c46d0382b14e0", size = 59815, upload-time = "2025-10-10T05:28:08.448Z" }, - { url = "https://files.pythonhosted.org/packages/51/8d/5a704ab3c17c55c21c86423458db8610626ca99cc9086a74dfeb7ee9054c/ijson-3.4.0.post0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d4d4afec780881edb2a0d2dd40b1cdbe246e630022d5192f266172a0307986a7", size = 59648, upload-time = "2025-10-10T05:28:09.307Z" }, { url = "https://files.pythonhosted.org/packages/25/56/ca5d6ca145d007f30b44e747f3c163bc08710ce004af0deaad4a2301339b/ijson-3.4.0.post0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432fb60ffb952926f9438e0539011e2dfcd108f8426ee826ccc6173308c3ff2c", size = 138279, upload-time = "2025-10-10T05:28:10.489Z" }, { url = "https://files.pythonhosted.org/packages/c3/d3/22e3cc806fcdda7ad4c8482ed74db7a017d4a1d49b4300c7bc07052fb561/ijson-3.4.0.post0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54a0e3e05d9a0c95ecba73d9579f146cf6d5c5874116c849dba2d39a5f30380e", size = 149110, upload-time = "2025-10-10T05:28:12.263Z" }, { url = "https://files.pythonhosted.org/packages/3e/04/efb30f413648b9267f5a33920ac124d7ebef3bc4063af8f6ffc8ca11ddcb/ijson-3.4.0.post0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05807edc0bcbd222dc6ea32a2b897f0c81dc7f12c8580148bc82f6d7f5e7ec7b", size = 
149026, upload-time = "2025-10-10T05:28:13.557Z" }, { url = "https://files.pythonhosted.org/packages/2d/cf/481165f7046ade32488719300a3994a437020bc41cfbb54334356348f513/ijson-3.4.0.post0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a5269af16f715855d9864937f9dd5c348ca1ac49cee6a2c7a1b7091c159e874f", size = 150012, upload-time = "2025-10-10T05:28:14.859Z" }, { url = "https://files.pythonhosted.org/packages/0f/24/642e3289917ecf860386e26dfde775f9962d26ab7f6c2e364ed3ca3c25d8/ijson-3.4.0.post0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b200df83c901f5bfa416d069ac71077aa1608f854a4c50df1b84ced560e9c9ec", size = 142193, upload-time = "2025-10-10T05:28:16.131Z" }, { url = "https://files.pythonhosted.org/packages/0f/f5/fd2f038abe95e553e1c3ee207cda19db9196eb416e63c7c89699a8cf0db7/ijson-3.4.0.post0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6458bd8e679cdff459a0a5e555b107c3bbacb1f382da3fe0f40e392871eb518d", size = 150904, upload-time = "2025-10-10T05:28:17.401Z" }, - { url = "https://files.pythonhosted.org/packages/49/35/24259d22519987928164e6cb8fe3486e1df0899b2999ada4b0498639b463/ijson-3.4.0.post0-cp312-cp312-win32.whl", hash = "sha256:55f7f656b5986326c978cbb3a9eea9e33f3ef6ecc4535b38f1d452c731da39ab", size = 52358, upload-time = "2025-10-10T05:28:18.315Z" }, - { url = "https://files.pythonhosted.org/packages/a1/2b/6f7ade27a8ff5758fc41006dadd2de01730def84fe3e60553b329c59e0d4/ijson-3.4.0.post0-cp312-cp312-win_amd64.whl", hash = "sha256:e15833dcf6f6d188fdc624a31cd0520c3ba21b6855dc304bc7c1a8aeca02d4ac", size = 54789, upload-time = "2025-10-10T05:28:19.552Z" }, ] [[package]] @@ -887,10 +921,10 @@ name = "jsonschema" version = "4.25.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "attrs" }, - { name = "jsonschema-specifications" }, - { name = "referencing" }, - { name = "rpds-py" }, + { name = "attrs", marker = "sys_platform == 'linux'" }, + { name = "jsonschema-specifications", marker = "sys_platform == 'linux'" }, + { name 
= "referencing", marker = "sys_platform == 'linux'" }, + { name = "rpds-py", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/74/69/f7185de793a29082a9f3c7728268ffb31cb5095131a9c139a74078e27336/jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85", size = 357342, upload-time = "2025-08-18T17:03:50.038Z" } wheels = [ @@ -902,7 +936,7 @@ name = "jsonschema-specifications" version = "2025.9.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "referencing" }, + { name = "referencing", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } wheels = [ @@ -918,30 +952,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2d/00/d90b10b962b4277f5e64a78b6609968859ff86889f5b898c1a778c06ec00/lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c", size = 111036, upload-time = "2024-08-13T19:48:58.603Z" }, ] -[[package]] -name = "latex2sympy2-extended" -version = "1.10.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "antlr4-python3-runtime" }, - { name = "sympy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f4/de/472f9115c14c6f6d8a5889cabe3418283d708bde62ce00402c29441deed4/latex2sympy2_extended-1.10.2.tar.gz", hash = "sha256:41a517ffcc5a140e910a7d1646ce6ff440817e5f9d48fc8279d88bd0925bc389", size = 206188, upload-time = "2025-07-02T15:26:06.225Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ab/60/dfbbf40e3a371388c0e03ff65b01319b7d4023e883df6d7261125772ffdc/latex2sympy2_extended-1.10.2-py3-none-any.whl", hash = 
"sha256:f910442c5b02a466c1046f47d05cc5285181068b882399281f30102715337fb7", size = 207855, upload-time = "2025-07-02T15:26:04.88Z" }, -] - [[package]] name = "llguidance" version = "1.3.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/95/48/3f7a9d3ff1b36bba92b5107a3a21286821227afe9ea464736133994d61fb/llguidance-1.3.0.tar.gz", hash = "sha256:861249afd51dc325646834462ea827e57a5c2b2042e108e6aae7059fdad9104d", size = 1070460, upload-time = "2025-10-20T19:58:44.164Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/33/be5acb85cd8cdc4afde33d9c234eece9f318e087920255af3c05864cd3e7/llguidance-1.3.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f7685222660a762e481ac633d49cc559c64980fe2ee59c8f932a5bb5cbc0c2c2", size = 3220647, upload-time = "2025-10-20T19:58:42.542Z" }, - { url = "https://files.pythonhosted.org/packages/82/e6/b48bda5b15efeaeb62bd0dba8fc6a01d4ae5457a85dbb5d18632385fe15c/llguidance-1.3.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:098030ff0687261a3f1bd54cf21fe951fc861d56d37a0671250dd36677eaf224", size = 3099830, upload-time = "2025-10-20T19:58:40.826Z" }, { url = "https://files.pythonhosted.org/packages/aa/11/44389d3d1526d7a5c38ffd587a5ebc61d7bee443ac1dea95f2089ad58f5f/llguidance-1.3.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f6caca5d78db7f76e1fbb0fff8607b861c32d47fa3d5dee2fc49de27ee269df", size = 2835242, upload-time = "2025-10-20T19:58:34.518Z" }, { url = "https://files.pythonhosted.org/packages/83/a8/1ff2bedb8f9acb46a2d2d603415d272bb622c142ea86f5b95445cc6e366c/llguidance-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc17e9dd602c3879bf91664a64bf72f54c74dbfbeb24ccfab6a5fe435b12f7aa", size = 3033133, upload-time = "2025-10-20T19:58:38.721Z" }, - { url = "https://files.pythonhosted.org/packages/5a/7e/809349638231f469b9056c0e1bfd924d5ef5558b3b3ec72d093b6fad33b1/llguidance-1.3.0-cp39-abi3-win_amd64.whl", hash 
= "sha256:1d1cd1c8618d1a13605d3e057c978651e551c8c469b481ee4041f1d6c436002d", size = 2789946, upload-time = "2025-10-20T19:58:45.958Z" }, ] [[package]] @@ -950,11 +968,8 @@ version = "0.44.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/89/6a/95a3d3610d5c75293d5dbbb2a76480d5d4eeba641557b69fe90af6c5b84e/llvmlite-0.44.0.tar.gz", hash = "sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4", size = 171880, upload-time = "2025-01-20T11:14:41.342Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/86/e3c3195b92e6e492458f16d233e58a1a812aa2bfbef9bdd0fbafcec85c60/llvmlite-0.44.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:1d671a56acf725bf1b531d5ef76b86660a5ab8ef19bb6a46064a705c6ca80aad", size = 28132297, upload-time = "2025-01-20T11:13:32.57Z" }, - { url = "https://files.pythonhosted.org/packages/d6/53/373b6b8be67b9221d12b24125fd0ec56b1078b660eeae266ec388a6ac9a0/llvmlite-0.44.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f79a728e0435493611c9f405168682bb75ffd1fbe6fc360733b850c80a026db", size = 26201105, upload-time = "2025-01-20T11:13:38.744Z" }, { url = "https://files.pythonhosted.org/packages/cb/da/8341fd3056419441286c8e26bf436923021005ece0bff5f41906476ae514/llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0143a5ef336da14deaa8ec26c5449ad5b6a2b564df82fcef4be040b9cacfea9", size = 42361901, upload-time = "2025-01-20T11:13:46.711Z" }, { url = "https://files.pythonhosted.org/packages/53/ad/d79349dc07b8a395a99153d7ce8b01d6fcdc9f8231355a5df55ded649b61/llvmlite-0.44.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d752f89e31b66db6f8da06df8b39f9b91e78c5feea1bf9e8c1fba1d1c24c065d", size = 41184247, upload-time = "2025-01-20T11:13:56.159Z" }, - { url = 
"https://files.pythonhosted.org/packages/e2/3b/a9a17366af80127bd09decbe2a54d8974b6d8b274b39bf47fbaedeec6307/llvmlite-0.44.0-cp312-cp312-win_amd64.whl", hash = "sha256:eae7e2d4ca8f88f89d315b48c6b741dcb925d6a1042da694aa16ab3dd4cbd3a1", size = 30332380, upload-time = "2025-01-20T11:14:02.442Z" }, ] [[package]] @@ -962,10 +977,10 @@ name = "lm-format-enforcer" version = "0.11.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "interegular" }, - { name = "packaging" }, - { name = "pydantic" }, - { name = "pyyaml" }, + { name = "interegular", marker = "sys_platform == 'linux'" }, + { name = "packaging", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "pyyaml", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/84/d5/41cd417ba7dfdbbcfe46cebf81fb3dfd7c591b89897560ad05bb410a465d/lm_format_enforcer-0.11.3.tar.gz", hash = "sha256:e68081c108719cce284a9bcc889709b26ffb085a1945b5eba3a12cfa96d528da", size = 40258, upload-time = "2025-08-24T19:37:47.527Z" } wheels = [ @@ -976,10 +991,6 @@ wheels = [ name = "loguru" version = "0.7.3" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "win32-setctime", marker = "sys_platform == 'win32'" }, -] sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" }, @@ -992,21 +1003,24 @@ source = { editable = 
"." } dependencies = [ { name = "aiohttp" }, { name = "beartype" }, + { name = "datasets" }, + { name = "flash-attn", marker = "sys_platform == 'linux'" }, { name = "jaxtyping" }, { name = "openai" }, { name = "peft" }, { name = "rich" }, - { name = "torch" }, - { name = "vllm" }, + { name = "setuptools" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.9.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "torch-c-dlpack-ext" }, + { name = "vllm", marker = "sys_platform == 'linux'" }, + { name = "wandb" }, ] [package.optional-dependencies] -examples = [ - { name = "datasets" }, - { name = "math-verify" }, -] -pipelinerl = [ - { name = "redis" }, +code-exec = [ + { name = "docker" }, ] [package.dev-dependencies] @@ -1031,17 +1045,21 @@ typing = [ requires-dist = [ { name = "aiohttp", specifier = ">=3.13.2" }, { name = "beartype", specifier = ">=0.22.9" }, - { name = "datasets", marker = "extra == 'examples'", specifier = "==4.4.1" }, + { name = "datasets", specifier = ">=4.4.2" }, + { name = "docker", marker = "extra == 'code-exec'", specifier = ">=7.1.0" }, + { name = "flash-attn", marker = "sys_platform == 'linux'", specifier = ">=2.7.0" }, { name = "jaxtyping", specifier = ">=0.3.4" }, - { name = "math-verify", marker = "extra == 'examples'", specifier = "==0.8.0" }, { name = "openai", specifier = ">=2.7.1" }, { name = "peft", specifier = ">=0.18.0" }, - { name = "redis", marker = "extra == 'pipelinerl'", specifier = ">=7.1.0" }, { name = "rich", specifier = ">=14.2.0" }, - { name = "torch", specifier = ">=2.8.0" }, - { name = "vllm", specifier = ">=0.13.0" }, + { name = "setuptools", specifier 
= ">=79.0.1" }, + { name = "torch", marker = "sys_platform != 'linux'", specifier = ">=2.9.0", index = "https://download.pytorch.org/whl/cpu" }, + { name = "torch", marker = "sys_platform == 'linux'", specifier = ">=2.9.0", index = "https://download.pytorch.org/whl/cu128" }, + { name = "torch-c-dlpack-ext", specifier = ">=0.1.4" }, + { name = "vllm", marker = "sys_platform == 'linux'", specifier = ">=0.12.0" }, + { name = "wandb", specifier = ">=0.23.1" }, ] -provides-extras = ["pipelinerl", "examples"] +provides-extras = ["code-exec"] [package.metadata.requires-dev] dev = [ @@ -1088,37 +1106,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" }, ] -[[package]] -name = "math-verify" -version = "0.8.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "latex2sympy2-extended" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/35/b5/b1db6fa6b6c28ebbe1889ee11a4703a72a2ca7750ec415f4559c758cf01a/math_verify-0.8.0.tar.gz", hash = "sha256:3295e0adb94bfe553ff6e3189c44f1916a85aa24ab5d1900f2086a706e28f7c4", size = 60191, upload-time = "2025-07-02T15:52:07.209Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fe/9f/59979f699b5c97334298f1295bc9fcdc9904d98d2276479bffff863d23b1/math_verify-0.8.0-py3-none-any.whl", hash = "sha256:31ca651296d817a9bb3fd58ca1fd0d192dcea709b1e5ecf2d0a4514c16f89087", size = 29994, upload-time = "2025-07-02T15:52:05.023Z" }, -] - [[package]] name = "mcp" version = "1.25.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "anyio" }, - { name = "httpx" }, - { name = "httpx-sse" }, - { name = "jsonschema" }, - { name = "pydantic" }, - { name = "pydantic-settings" }, - { name = "pyjwt", extra = ["crypto"] }, - { 
name = "python-multipart" }, - { name = "pywin32", marker = "sys_platform == 'win32'" }, - { name = "sse-starlette" }, - { name = "starlette" }, - { name = "typing-extensions" }, - { name = "typing-inspection" }, - { name = "uvicorn", marker = "sys_platform != 'emscripten'" }, + { name = "anyio", marker = "sys_platform == 'linux'" }, + { name = "httpx", marker = "sys_platform == 'linux'" }, + { name = "httpx-sse", marker = "sys_platform == 'linux'" }, + { name = "jsonschema", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "pydantic-settings", marker = "sys_platform == 'linux'" }, + { name = "pyjwt", extra = ["crypto"], marker = "sys_platform == 'linux'" }, + { name = "python-multipart", marker = "sys_platform == 'linux'" }, + { name = "sse-starlette", marker = "sys_platform == 'linux'" }, + { name = "starlette", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, + { name = "typing-inspection", marker = "sys_platform == 'linux'" }, + { name = "uvicorn", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d5/2d/649d80a0ecf6a1f82632ca44bec21c0461a9d9fc8934d38cb5b319f2db5e/mcp-1.25.0.tar.gz", hash = "sha256:56310361ebf0364e2d438e5b45f7668cbb124e158bb358333cd06e49e83a6802", size = 605387, upload-time = "2025-12-19T10:19:56.985Z" } wheels = [ @@ -1136,66 +1141,26 @@ wheels = [ [[package]] name = "mistral-common" -version = "1.8.5" +version = "1.8.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "jsonschema" }, - { name = "numpy" }, - { name = "pillow" }, - { name = "pydantic" }, - { name = "pydantic-extra-types", extra = ["pycountry"] }, - { name = "requests" }, - { name = "tiktoken" }, - { name = "typing-extensions" }, + { name = "jsonschema", marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker 
= "sys_platform == 'linux'" }, + { name = "pillow", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "pydantic-extra-types", extra = ["pycountry"], marker = "sys_platform == 'linux'" }, + { name = "requests", marker = "sys_platform == 'linux'" }, + { name = "tiktoken", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ce/ff/1992a00ccc936f2c6e69ecb1f2cac678e0fd46c53c71bdab99eda4f89dfd/mistral_common-1.8.5.tar.gz", hash = "sha256:9f6204ede9c807f09040a208a9381ae78ef93e2e5a9cd5202dc12e712a025de8", size = 6331923, upload-time = "2025-09-12T06:43:01.937Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/bb/6fc2e46d9920c80f0d053d58be5b0546c18010ff3a5f9b9d91299226e989/mistral_common-1.8.8.tar.gz", hash = "sha256:8ae28b3f88bce1b9396f5d1107e5ea87e4130486b9f6d811df6d5ac07bff2186", size = 6337014, upload-time = "2025-12-22T10:51:47.245Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/91/4a/54e19c5e75939fd9418c7b806c21d12cf252ea2ba38f122b597272b459dd/mistral_common-1.8.5-py3-none-any.whl", hash = "sha256:f3cf87b61958a00485e603f3fe0530eb509d7e9b2f7178329dcd260e307eced1", size = 6515140, upload-time = "2025-09-12T06:42:59.622Z" }, + { url = "https://files.pythonhosted.org/packages/73/02/c1866598c8e94a4d0593b73e6dec0afea722227b9b3223bf6bb8ab269fa7/mistral_common-1.8.8-py3-none-any.whl", hash = "sha256:f63ce79b1867b3fc7c8b66fcaedab3b07966185567558038dc02321c17e4f39f", size = 6518005, upload-time = "2025-12-22T10:51:44.88Z" }, ] [package.optional-dependencies] image = [ - { name = "opencv-python-headless" }, -] - -[[package]] -name = "mlx" -version = "0.29.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mlx-metal", marker = "sys_platform == 'darwin'" }, -] -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/07/f5/14e12e219a2715296150d35f930dc3a6ff319cd60126408e563f03100113/mlx-0.29.3-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:86c62791ce930028d75c41b88b4e3ceb58f5f2e263ff9bfacda998b0c03d9544", size = 549516, upload-time = "2025-10-17T19:18:13.831Z" }, - { url = "https://files.pythonhosted.org/packages/c6/e2/5177c80e8c33a8be89fa45fa0a839d5b6a5578687d0ec973bf03638a4e73/mlx-0.29.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cddf6bcdc561094af6b3f0706f8768ecc5216a97eb6973e838c3ac2e2fca2cc8", size = 549509, upload-time = "2025-10-17T19:17:21.517Z" }, - { url = "https://files.pythonhosted.org/packages/11/89/aa424217a7a0291b84f8969d504ac63f5af0ef60f248fe5562c3d6e44048/mlx-0.29.3-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:b2e1a249437d017a7425358420d28e641b7bc9c2650f3e013c1b1f4f239d8533", size = 549511, upload-time = "2025-10-17T19:16:54.227Z" }, -] - -[[package]] -name = "mlx-lm" -version = "0.28.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "jinja2" }, - { name = "mlx", marker = "sys_platform == 'darwin'" }, - { name = "numpy" }, - { name = "protobuf" }, - { name = "pyyaml" }, - { name = "transformers" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/51/f6/15e002d52c28d8c544ec3aaf9053677468333e6ef0e76ea68579fd77b76d/mlx_lm-0.28.3.tar.gz", hash = "sha256:75df2b925d343ebaf50b63008dede4fe98cd3b02b1b24b7da71ebeb198d674f0", size = 214455, upload-time = "2025-10-17T21:44:33.921Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/a6/db3b44a5ac1a1174605628b0a477fbe4632d4fad1f94cf08647e27cc79ad/mlx_lm-0.28.3-py3-none-any.whl", hash = "sha256:ec103e2c9a06bd2cbafd41aafc975e40262176f7360d4f53ec342cebb9e0e6ea", size = 294506, upload-time = "2025-10-17T21:44:32.447Z" }, -] - -[[package]] -name = "mlx-metal" -version = "0.29.3" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/41/95/a00054a006df82bb1b5b8f666ae44a676b259146fadbff90fe654309fefc/mlx_metal-0.29.3-py3-none-macosx_13_0_arm64.whl", hash = "sha256:27b5a4d905202a71e84d9fd559ea0236813f6f960ef494e5cafe9c45df4c9d7c", size = 36817352, upload-time = "2025-10-17T19:19:25.801Z" }, - { url = "https://files.pythonhosted.org/packages/c0/d8/5ee91eac16dfcf0334103120b47d4abd8c890ccc0d73d3eee4770ce8810f/mlx_metal-0.29.3-py3-none-macosx_14_0_arm64.whl", hash = "sha256:f426d4b67f96b4d6f0ed50d5992933595aadb370dc3e9ed2410bafbc16229882", size = 36555573, upload-time = "2025-10-17T19:18:42.098Z" }, - { url = "https://files.pythonhosted.org/packages/cd/9a/39b7ecdf21cf2a39ced8d7933eed65c6cb38295cadfd0907dd1abd4d1ded/mlx_metal-0.29.3-py3-none-macosx_15_0_arm64.whl", hash = "sha256:106616f7f825851043c53d3dc186965c003985da9cbb6e5c034f35108fc1fc27", size = 36549163, upload-time = "2025-10-17T19:18:37.701Z" }, + { name = "opencv-python-headless", marker = "sys_platform == 'linux'" }, ] [[package]] @@ -1203,13 +1168,13 @@ name = "model-hosting-container-standards" version = "0.1.12" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "fastapi" }, - { name = "httpx" }, - { name = "jmespath" }, - { name = "pydantic" }, - { name = "setuptools" }, - { name = "starlette" }, - { name = "supervisor" }, + { name = "fastapi", marker = "sys_platform == 'linux'" }, + { name = "httpx", marker = "sys_platform == 'linux'" }, + { name = "jmespath", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "sys_platform == 'linux'" }, + { name = "starlette", marker = "sys_platform == 'linux'" }, + { name = "supervisor", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/23/cc/014bdcc700f1d4393578b55df09c1ed76b57feb9a542208d8c25e7c0bb1b/model_hosting_container_standards-0.1.12.tar.gz", hash = 
"sha256:5a38814201d319eaf258d816697caa16d39b5222319c2d5116d779b30babe602", size = 79119, upload-time = "2025-12-15T23:02:58.848Z" } wheels = [ @@ -1231,30 +1196,22 @@ version = "1.1.2" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ad/bd/8b0d01c756203fbab65d265859749860682ccd2a59594609aeec3a144efa/msgpack-1.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:70a0dff9d1f8da25179ffcf880e10cf1aad55fdb63cd59c9a49a1b82290062aa", size = 81939, upload-time = "2025-10-08T09:15:01.472Z" }, - { url = "https://files.pythonhosted.org/packages/34/68/ba4f155f793a74c1483d4bdef136e1023f7bcba557f0db4ef3db3c665cf1/msgpack-1.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:446abdd8b94b55c800ac34b102dffd2f6aa0ce643c55dfc017ad89347db3dbdb", size = 85064, upload-time = "2025-10-08T09:15:03.764Z" }, { url = "https://files.pythonhosted.org/packages/f2/60/a064b0345fc36c4c3d2c743c82d9100c40388d77f0b48b2f04d6041dbec1/msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f", size = 417131, upload-time = "2025-10-08T09:15:05.136Z" }, { url = "https://files.pythonhosted.org/packages/65/92/a5100f7185a800a5d29f8d14041f61475b9de465ffcc0f3b9fba606e4505/msgpack-1.1.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:372839311ccf6bdaf39b00b61288e0557916c3729529b301c52c2d88842add42", size = 427556, upload-time = "2025-10-08T09:15:06.837Z" }, { url = 
"https://files.pythonhosted.org/packages/f5/87/ffe21d1bf7d9991354ad93949286f643b2bb6ddbeab66373922b44c3b8cc/msgpack-1.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2929af52106ca73fcb28576218476ffbb531a036c2adbcf54a3664de124303e9", size = 404920, upload-time = "2025-10-08T09:15:08.179Z" }, { url = "https://files.pythonhosted.org/packages/ff/41/8543ed2b8604f7c0d89ce066f42007faac1eaa7d79a81555f206a5cdb889/msgpack-1.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be52a8fc79e45b0364210eef5234a7cf8d330836d0a64dfbb878efa903d84620", size = 415013, upload-time = "2025-10-08T09:15:09.83Z" }, - { url = "https://files.pythonhosted.org/packages/41/0d/2ddfaa8b7e1cee6c490d46cb0a39742b19e2481600a7a0e96537e9c22f43/msgpack-1.1.2-cp312-cp312-win32.whl", hash = "sha256:1fff3d825d7859ac888b0fbda39a42d59193543920eda9d9bea44d958a878029", size = 65096, upload-time = "2025-10-08T09:15:11.11Z" }, - { url = "https://files.pythonhosted.org/packages/8c/ec/d431eb7941fb55a31dd6ca3404d41fbb52d99172df2e7707754488390910/msgpack-1.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1de460f0403172cff81169a30b9a92b260cb809c4cb7e2fc79ae8d0510c78b6b", size = 72708, upload-time = "2025-10-08T09:15:12.554Z" }, - { url = "https://files.pythonhosted.org/packages/c5/31/5b1a1f70eb0e87d1678e9624908f86317787b536060641d6798e3cf70ace/msgpack-1.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:be5980f3ee0e6bd44f3a9e9dea01054f175b50c3e6cdb692bc9424c0bbb8bf69", size = 64119, upload-time = "2025-10-08T09:15:13.589Z" }, ] [[package]] name = "msgspec" -version = "0.19.0" +version = "0.20.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cf/9b/95d8ce458462b8b71b8a70fa94563b2498b89933689f3a7b8911edfae3d7/msgspec-0.19.0.tar.gz", hash = "sha256:604037e7cd475345848116e89c553aa9a233259733ab51986ac924ab1b976f8e", size = 216934, upload-time = "2024-12-27T17:40:28.597Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/ea/9c/bfbd12955a49180cbd234c5d29ec6f74fe641698f0cd9df154a854fc8a15/msgspec-0.20.0.tar.gz", hash = "sha256:692349e588fde322875f8d3025ac01689fead5901e7fb18d6870a44519d62a29", size = 317862, upload-time = "2025-11-24T03:56:28.934Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b2/5f/a70c24f075e3e7af2fae5414c7048b0e11389685b7f717bb55ba282a34a7/msgspec-0.19.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f98bd8962ad549c27d63845b50af3f53ec468b6318400c9f1adfe8b092d7b62f", size = 190485, upload-time = "2024-12-27T17:39:44.974Z" }, - { url = "https://files.pythonhosted.org/packages/89/b0/1b9763938cfae12acf14b682fcf05c92855974d921a5a985ecc197d1c672/msgspec-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:43bbb237feab761b815ed9df43b266114203f53596f9b6e6f00ebd79d178cdf2", size = 183910, upload-time = "2024-12-27T17:39:46.401Z" }, - { url = "https://files.pythonhosted.org/packages/87/81/0c8c93f0b92c97e326b279795f9c5b956c5a97af28ca0fbb9fd86c83737a/msgspec-0.19.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cfc033c02c3e0aec52b71710d7f84cb3ca5eb407ab2ad23d75631153fdb1f12", size = 210633, upload-time = "2024-12-27T17:39:49.099Z" }, - { url = "https://files.pythonhosted.org/packages/d0/ef/c5422ce8af73928d194a6606f8ae36e93a52fd5e8df5abd366903a5ca8da/msgspec-0.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d911c442571605e17658ca2b416fd8579c5050ac9adc5e00c2cb3126c97f73bc", size = 213594, upload-time = "2024-12-27T17:39:51.204Z" }, - { url = "https://files.pythonhosted.org/packages/19/2b/4137bc2ed45660444842d042be2cf5b18aa06efd2cda107cff18253b9653/msgspec-0.19.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:757b501fa57e24896cf40a831442b19a864f56d253679f34f260dcb002524a6c", size = 214053, upload-time = "2024-12-27T17:39:52.866Z" }, - { url = 
"https://files.pythonhosted.org/packages/9d/e6/8ad51bdc806aac1dc501e8fe43f759f9ed7284043d722b53323ea421c360/msgspec-0.19.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5f0f65f29b45e2816d8bded36e6b837a4bf5fb60ec4bc3c625fa2c6da4124537", size = 219081, upload-time = "2024-12-27T17:39:55.142Z" }, - { url = "https://files.pythonhosted.org/packages/b1/ef/27dd35a7049c9a4f4211c6cd6a8c9db0a50647546f003a5867827ec45391/msgspec-0.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:067f0de1c33cfa0b6a8206562efdf6be5985b988b53dd244a8e06f993f27c8c0", size = 187467, upload-time = "2024-12-27T17:39:56.531Z" }, + { url = "https://files.pythonhosted.org/packages/49/d6/9709ee093b7742362c2934bfb1bbe791a1e09bed3ea5d8a18ce552fbfd73/msgspec-0.20.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:558ed73315efa51b1538fa8f1d3b22c8c5ff6d9a2a62eff87d25829b94fc5054", size = 218852, upload-time = "2025-11-24T03:55:35.575Z" }, + { url = "https://files.pythonhosted.org/packages/5c/a2/488517a43ccf5a4b6b6eca6dd4ede0bd82b043d1539dd6bb908a19f8efd3/msgspec-0.20.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:509ac1362a1d53aa66798c9b9fd76872d7faa30fcf89b2fba3bcbfd559d56eb0", size = 224937, upload-time = "2025-11-24T03:55:36.859Z" }, + { url = "https://files.pythonhosted.org/packages/d5/e8/49b832808aa23b85d4f090d1d2e48a4e3834871415031ed7c5fe48723156/msgspec-0.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1353c2c93423602e7dea1aa4c92f3391fdfc25ff40e0bacf81d34dbc68adb870", size = 222858, upload-time = "2025-11-24T03:55:38.187Z" }, + { url = "https://files.pythonhosted.org/packages/9f/56/1dc2fa53685dca9c3f243a6cbecd34e856858354e455b77f47ebd76cf5bf/msgspec-0.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cb33b5eb5adb3c33d749684471c6a165468395d7aa02d8867c15103b81e1da3e", size = 227248, upload-time = "2025-11-24T03:55:39.496Z" }, ] [[package]] @@ -1302,11 +1259,11 @@ wheels = [ 
[[package]] name = "networkx" -version = "3.5" +version = "3.6.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, ] [[package]] @@ -1315,7 +1272,6 @@ version = "1.13.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/43/73/79a0b22fc731989c708068427579e840a6cf4e937fe7ae5c5d0b7356ac22/ninja-1.13.0.tar.gz", hash = "sha256:4a40ce995ded54d9dc24f8ea37ff3bf62ad192b547f6c7126e7e25045e76f978", size = 242558, upload-time = "2025-08-11T15:10:19.421Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3c/74/d02409ed2aa865e051b7edda22ad416a39d81a84980f544f8de717cab133/ninja-1.13.0-py3-none-macosx_10_9_universal2.whl", hash = "sha256:fa2a8bfc62e31b08f83127d1613d10821775a0eb334197154c4d6067b7068ff1", size = 310125, upload-time = "2025-08-11T15:09:50.971Z" 
}, { url = "https://files.pythonhosted.org/packages/8e/de/6e1cd6b84b412ac1ef327b76f0641aeb5dcc01e9d3f9eee0286d0c34fd93/ninja-1.13.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3d00c692fb717fd511abeb44b8c5d00340c36938c12d6538ba989fe764e79630", size = 177467, upload-time = "2025-08-11T15:09:52.767Z" }, { url = "https://files.pythonhosted.org/packages/c8/83/49320fb6e58ae3c079381e333575fdbcf1cca3506ee160a2dcce775046fa/ninja-1.13.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:be7f478ff9f96a128b599a964fc60a6a87b9fa332ee1bd44fa243ac88d50291c", size = 187834, upload-time = "2025-08-11T15:09:54.115Z" }, { url = "https://files.pythonhosted.org/packages/56/c7/ba22748fb59f7f896b609cd3e568d28a0a367a6d953c24c461fe04fc4433/ninja-1.13.0-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:60056592cf495e9a6a4bea3cd178903056ecb0943e4de45a2ea825edb6dc8d3e", size = 202736, upload-time = "2025-08-11T15:09:55.745Z" }, @@ -1330,9 +1286,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3c/fb/95752eb635bb8ad27d101d71bef15bc63049de23f299e312878fc21cb2da/ninja-1.13.0-py3-none-musllinux_1_2_riscv64.whl", hash = "sha256:d741a5e6754e0bda767e3274a0f0deeef4807f1fec6c0d7921a0244018926ae5", size = 585106, upload-time = "2025-08-11T15:10:09.818Z" }, { url = "https://files.pythonhosted.org/packages/c1/31/aa56a1a286703800c0cbe39fb4e82811c277772dc8cd084f442dd8e2938a/ninja-1.13.0-py3-none-musllinux_1_2_s390x.whl", hash = "sha256:e8bad11f8a00b64137e9b315b137d8bb6cbf3086fbdc43bf1f90fd33324d2e96", size = 707138, upload-time = "2025-08-11T15:10:11.366Z" }, { url = "https://files.pythonhosted.org/packages/34/6f/5f5a54a1041af945130abdb2b8529cbef0cdcbbf9bcf3f4195378319d29a/ninja-1.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b4f2a072db3c0f944c32793e91532d8948d20d9ab83da9c0c7c15b5768072200", size = 581758, upload-time = "2025-08-11T15:10:13.295Z" }, - { url = 
"https://files.pythonhosted.org/packages/95/97/51359c77527d45943fe7a94d00a3843b81162e6c4244b3579fe8fc54cb9c/ninja-1.13.0-py3-none-win32.whl", hash = "sha256:8cfbb80b4a53456ae8a39f90ae3d7a2129f45ea164f43fadfa15dc38c4aef1c9", size = 267201, upload-time = "2025-08-11T15:10:15.158Z" }, - { url = "https://files.pythonhosted.org/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl", hash = "sha256:fb8ee8719f8af47fed145cced4a85f0755dd55d45b2bddaf7431fa89803c5f3e", size = 309975, upload-time = "2025-08-11T15:10:16.697Z" }, - { url = "https://files.pythonhosted.org/packages/df/93/a7b983643d1253bb223234b5b226e69de6cda02b76cdca7770f684b795f5/ninja-1.13.0-py3-none-win_arm64.whl", hash = "sha256:3c0b40b1f0bba764644385319028650087b4c1b18cdfa6f45cb39a3669b81aa9", size = 290806, upload-time = "2025-08-11T15:10:18.018Z" }, ] [[package]] @@ -1340,34 +1293,47 @@ name = "numba" version = "0.61.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "llvmlite" }, - { name = "numpy" }, + { name = "llvmlite", marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/1c/a0/e21f57604304aa03ebb8e098429222722ad99176a4f979d34af1d1ee80da/numba-0.61.2.tar.gz", hash = "sha256:8750ee147940a6637b80ecf7f95062185ad8726c8c28a2295b8ec1160a196f7d", size = 2820615, upload-time = "2025-04-09T02:58:07.659Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b4/a0/c6b7b9c615cfa3b98c4c63f4316e3f6b3bbe2387740277006551784218cd/numba-0.61.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2", size = 2776626, upload-time = "2025-04-09T02:57:51.857Z" }, - { url = 
"https://files.pythonhosted.org/packages/92/4a/fe4e3c2ecad72d88f5f8cd04e7f7cff49e718398a2fac02d2947480a00ca/numba-0.61.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ddce10009bc097b080fc96876d14c051cc0c7679e99de3e0af59014dab7dfe8", size = 2779287, upload-time = "2025-04-09T02:57:53.658Z" }, { url = "https://files.pythonhosted.org/packages/9a/2d/e518df036feab381c23a624dac47f8445ac55686ec7f11083655eb707da3/numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b1bb509d01f23d70325d3a5a0e237cbc9544dd50e50588bc581ba860c213546", size = 3885928, upload-time = "2025-04-09T02:57:55.206Z" }, { url = "https://files.pythonhosted.org/packages/10/0f/23cced68ead67b75d77cfcca3df4991d1855c897ee0ff3fe25a56ed82108/numba-0.61.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:48a53a3de8f8793526cbe330f2a39fe9a6638efcbf11bd63f3d2f9757ae345cd", size = 3577115, upload-time = "2025-04-09T02:57:56.818Z" }, - { url = "https://files.pythonhosted.org/packages/68/1d/ddb3e704c5a8fb90142bf9dc195c27db02a08a99f037395503bfbc1d14b3/numba-0.61.2-cp312-cp312-win_amd64.whl", hash = "sha256:97cf4f12c728cf77c9c1d7c23707e4d8fb4632b46275f8f3397de33e5877af18", size = 2831929, upload-time = "2025-04-09T02:57:58.45Z" }, ] [[package]] name = "numpy" version = "2.2.6" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "sys_platform == 'linux'", +] sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = 
"2025-05-17T21:34:39.648Z" }, - { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" }, - { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" }, - { url = "https://files.pythonhosted.org/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382, upload-time = "2025-05-17T21:35:21.414Z" }, { url = "https://files.pythonhosted.org/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462, upload-time = "2025-05-17T21:35:42.174Z" }, { url = "https://files.pythonhosted.org/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618, upload-time = "2025-05-17T21:36:06.711Z" }, { url = "https://files.pythonhosted.org/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511, upload-time = "2025-05-17T21:36:29.965Z" }, { url = 
"https://files.pythonhosted.org/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783, upload-time = "2025-05-17T21:36:56.883Z" }, - { url = "https://files.pythonhosted.org/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506, upload-time = "2025-05-17T21:37:07.368Z" }, - { url = "https://files.pythonhosted.org/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190, upload-time = "2025-05-17T21:37:26.213Z" }, +] + +[[package]] +name = "numpy" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "sys_platform != 'darwin' and sys_platform != 'linux'", + "sys_platform == 'darwin'", +] +sdist = { url = "https://files.pythonhosted.org/packages/a4/7a/6a3d14e205d292b738db449d0de649b373a59edb0d0b4493821d0a3e8718/numpy-2.4.0.tar.gz", hash = "sha256:6e504f7b16118198f138ef31ba24d985b124c2c469fe8467007cf30fd992f934", size = 20685720, upload-time = "2025-12-20T16:18:19.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/ff/f6400ffec95de41c74b8e73df32e3fff1830633193a7b1e409be7fb1bb8c/numpy-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2a8b6bb8369abefb8bd1801b054ad50e02b3275c8614dc6e5b0373c305291037", size = 16653117, upload-time = "2025-12-20T16:16:06.709Z" }, + { url = "https://files.pythonhosted.org/packages/fd/28/6c23e97450035072e8d830a3c411bf1abd1f42c611ff9d29e3d8f55c6252/numpy-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e284ca13d5a8367e43734148622caf0b261b275673823593e3e3634a6490f83", size = 12369711, 
upload-time = "2025-12-20T16:16:08.758Z" }, + { url = "https://files.pythonhosted.org/packages/bc/af/acbef97b630ab1bb45e6a7d01d1452e4251aa88ce680ac36e56c272120ec/numpy-2.4.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:49ff32b09f5aa0cd30a20c2b39db3e669c845589f2b7fc910365210887e39344", size = 5198355, upload-time = "2025-12-20T16:16:10.902Z" }, + { url = "https://files.pythonhosted.org/packages/c1/c8/4e0d436b66b826f2e53330adaa6311f5cac9871a5b5c31ad773b27f25a74/numpy-2.4.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:36cbfb13c152b1c7c184ddac43765db8ad672567e7bafff2cc755a09917ed2e6", size = 6545298, upload-time = "2025-12-20T16:16:12.607Z" }, + { url = "https://files.pythonhosted.org/packages/d3/6f/f868765d44e6fc466467ed810ba9d8d6db1add7d4a748abfa2a4c99a3194/numpy-2.4.0-cp312-cp312-win32.whl", hash = "sha256:92190db305a6f48734d3982f2c60fa30d6b5ee9bff10f2887b930d7b40119f4c", size = 5955432, upload-time = "2025-12-20T16:16:25.06Z" }, + { url = "https://files.pythonhosted.org/packages/d4/b5/94c1e79fcbab38d1ca15e13777477b2914dd2d559b410f96949d6637b085/numpy-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:680060061adb2d74ce352628cb798cfdec399068aa7f07ba9fb818b2b3305f98", size = 12306201, upload-time = "2025-12-20T16:16:26.979Z" }, + { url = "https://files.pythonhosted.org/packages/70/09/c39dadf0b13bb0768cd29d6a3aaff1fb7c6905ac40e9aaeca26b1c086e06/numpy-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:39699233bc72dd482da1415dcb06076e32f60eddc796a796c5fb6c5efce94667", size = 10308234, upload-time = "2025-12-20T16:16:29.417Z" }, ] [[package]] @@ -1375,6 +1341,7 @@ name = "nvidia-cublas-cu12" version = "12.8.4.1" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time 
= "2025-03-07T01:43:53.556Z" }, { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, ] @@ -1383,6 +1350,7 @@ name = "nvidia-cuda-cupti-cu12" version = "12.8.90" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" }, { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, ] @@ -1392,6 +1360,7 @@ version = "12.8.93" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" }, ] 
[[package]] @@ -1399,6 +1368,7 @@ name = "nvidia-cuda-runtime-cu12" version = "12.8.90" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" }, { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, ] @@ -1407,20 +1377,20 @@ name = "nvidia-cudnn-cu12" version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, ] [[package]] name = "nvidia-cudnn-frontend" -version = "1.16.0" +version = "1.17.0" source = { registry = "https://pypi.org/simple" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/00/39/79b606e805abd67ab4fa72f752a5413a496159f10d94fbdb1d67bb5ae86c/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd6fdd71c0896ff2ca1809d914cbd17f2904d55863f8881f47946e1d634c7a88", size = 1839271, upload-time = "2025-11-07T01:29:53.06Z" }, - { url = "https://files.pythonhosted.org/packages/09/21/a0e0d50ba8d7b639fe635500fee0d9c0319561b1ae72176d7024ec04b439/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:16efb069d4bda4d3b99134f59f376cfd4d09558298bd96af778fdc7f2851e696", size = 1954062, upload-time = "2025-11-07T01:32:18.556Z" }, - { url = "https://files.pythonhosted.org/packages/ce/d6/30ae67bb9c010e9459d1211c56d73373eb4e3dd9f57f4c3c1fe0966efcb1/nvidia_cudnn_frontend-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:7b7860db03767c158accbe0b4e9c9553506513cc970ff08ed28c7761681ac466", size = 1368435, upload-time = "2025-11-07T01:26:28.022Z" }, + { url = "https://files.pythonhosted.org/packages/42/d9/f58ed6292c9396f7422812a0a2d9f80cc5a623ea6c758bcb3d34d4795bb8/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:de0c473f32d705abcf14f351615f7ffbeed7320e3499cf2195ae5689652a2592", size = 1917620, upload-time = "2025-12-20T00:27:46.179Z" }, + { url = "https://files.pythonhosted.org/packages/db/eb/c641135632bd2afc21339aadee96af4c5db1460dfa07ca74836de75a590f/nvidia_cudnn_frontend-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c913c87fca691a91385287f2587575531933acfebc85c33dbcecb191886c7a53", size = 2038994, upload-time = "2025-12-20T00:25:18.9Z" }, ] [[package]] @@ -1428,9 +1398,10 @@ name = "nvidia-cufft-cu12" version = "11.3.3.83" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = 
"https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" }, { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, ] @@ -1440,6 +1411,7 @@ version = "1.13.1.3" source = { registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, + { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" }, ] [[package]] @@ -1447,6 +1419,7 @@ name = "nvidia-curand-cu12" version = "10.3.9.90" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" }, { url = 
"https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, ] @@ -1455,11 +1428,12 @@ name = "nvidia-cusolver-cu12" version = "11.7.3.90" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12" }, - { name = "nvidia-cusparse-cu12" }, - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, ] @@ -1468,9 +1442,10 @@ name = "nvidia-cusparse-cu12" version = "12.5.8.93" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, ] wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" }, { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, ] @@ -1479,21 +1454,22 @@ name = "nvidia-cusparselt-cu12" version = "0.7.1" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, ] [[package]] name = "nvidia-cutlass-dsl" -version = "4.3.3" +version = "4.3.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cuda-python" }, - { name = "numpy" }, - { name = "typing-extensions" }, + { name = "cuda-python", marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/d4/7c5ef53ccf75d7f99a9ea29cae9f9c0233229b75b3b22f85a4ef4f52e6ab/nvidia_cutlass_dsl-4.3.3-cp312-cp312-manylinux_2_28_aarch64.whl", hash = 
"sha256:3278526f54bddd920d8e539771e5820c6166c549a1e67813375025f39417dec6", size = 58734009, upload-time = "2025-12-10T09:23:29.305Z" }, - { url = "https://files.pythonhosted.org/packages/88/a8/a27562194cc4182c67793cd21c5dbf9468cd5a49c775a487153c6f28364c/nvidia_cutlass_dsl-4.3.3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:f2b25816b8bb8bc332bcbf6fc341347b5d728344cf185c65af0dd73e8503d5c7", size = 58596724, upload-time = "2025-12-10T11:01:07.228Z" }, + { url = "https://files.pythonhosted.org/packages/86/ee/53d22e2e14cb763927d85f7ec9748f6af6d27a2b7f43d52de014728da10e/nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:57693d87677919572ab9eefa386b3f39e8e888bc4a9db7ab8730a97e8dbe06b4", size = 58736300, upload-time = "2025-12-21T07:41:25.723Z" }, + { url = "https://files.pythonhosted.org/packages/66/f6/47489e07081cd4060f08bfa4166f8ff32beaecf71c06060d03bde88f3b6c/nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a48fbff859e44dd548f8f26819d97d0595acea70e3b057c91dfdb47929015c72", size = 58599014, upload-time = "2025-12-21T07:38:51.632Z" }, ] [[package]] @@ -1510,6 +1486,7 @@ name = "nvidia-nccl-cu12" version = "2.27.5" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, ] @@ -1519,6 +1496,7 @@ version = "12.8.93" source = { 
registry = "https://pypi.org/simple" } wheels = [ { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" }, ] [[package]] @@ -1526,6 +1504,7 @@ name = "nvidia-nvshmem-cu12" version = "3.3.20" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/92/9d/3dd98852568fb845ec1f7902c90a22b240fe1cbabda411ccedf2fd737b7b/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b0b960da3842212758e4fa4696b94f129090b30e5122fea3c5345916545cff0", size = 124484616, upload-time = "2025-08-04T20:24:59.172Z" }, { url = "https://files.pythonhosted.org/packages/3b/6c/99acb2f9eb85c29fc6f3a7ac4dccfd992e22666dd08a642b303311326a97/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5", size = 124657145, upload-time = "2025-08-04T20:25:19.995Z" }, ] @@ -1534,12 +1513,13 @@ name = "nvidia-nvtx-cu12" version = "12.8.90" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" }, { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, ] [[package]] name = "openai" -version = "2.7.1" +version = "2.14.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1551,9 +1531,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/51/a2/f4023c1e0c868a6a5854955b3374f17153388aed95e835af114a17eac95b/openai-2.7.1.tar.gz", hash = "sha256:df4d4a3622b2df3475ead8eb0fbb3c27fd1c070fa2e55d778ca4f40e0186c726", size = 595933, upload-time = "2025-11-04T06:07:23.069Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/b1/12fe1c196bea326261718eb037307c1c1fe1dedc2d2d4de777df822e6238/openai-2.14.0.tar.gz", hash = "sha256:419357bedde9402d23bf8f2ee372fca1985a73348debba94bddff06f19459952", size = 626938, upload-time = "2025-12-19T03:28:45.742Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8c/74/6bfc3adc81f6c2cea4439f2a734c40e3a420703bbcdc539890096a732bbd/openai-2.7.1-py3-none-any.whl", hash = "sha256:2f2530354d94c59c614645a4662b9dab0a5b881c5cd767a8587398feac0c9021", size = 1008780, upload-time = "2025-11-04T06:07:20.818Z" }, + { url = "https://files.pythonhosted.org/packages/27/4b/7c1a00c2c3fbd004253937f7520f692a9650767aa73894d7a34f0d65d3f4/openai-2.14.0-py3-none-any.whl", hash = "sha256:7ea40aca4ffc4c4a776e77679021b47eec1160e341f42ae086ba949c9dcc9183", size = 1067558, upload-time = "2025-12-19T03:28:43.727Z" }, ] [[package]] @@ -1561,11 +1541,10 @@ name = "openai-harmony" version = "0.0.8" source = { registry = 
"https://pypi.org/simple" } dependencies = [ - { name = "pydantic" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3e/92/2d038d096f29179c7c9571b431f9e739f87a487121901725e23fe338dd9d/openai_harmony-0.0.8.tar.gz", hash = "sha256:6e43f98e6c242fa2de6f8ea12eab24af63fa2ed3e89c06341fb9d92632c5cbdf", size = 284777, upload-time = "2025-11-05T19:07:06.727Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/45/c6/2502f416d46be3ec08bb66d696cccffb57781a499e3ff2e4d7c174af4e8f/openai_harmony-0.0.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:029ec25ca74abe48fdb58eb9fdd2a8c1618581fc33ce8e5653f8a1ffbfbd9326", size = 2627806, upload-time = "2025-11-05T19:06:57.063Z" }, { url = "https://files.pythonhosted.org/packages/d3/d2/ce6953ca87db9cae3e775024184da7d1c5cb88cead19a2d75b42f00a959c/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4f709815924ec325b9a890e6ab2bbb0ceec8e319a4e257328eb752cf36b2efc", size = 2948463, upload-time = "2025-11-05T19:06:48.17Z" }, { url = "https://files.pythonhosted.org/packages/fa/4c/b553c9651662d6ce102ca7f3629d268b23df1abe5841e24bed81e8a8e949/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5cfcfd963b50a41fc656c84d3440ca6eecdccd6c552158ce790b8f2e33dfb5a9", size = 2704083, upload-time = "2025-11-05T19:06:50.205Z" }, { url = "https://files.pythonhosted.org/packages/9b/af/4eec8f9ab9c27bcdb444460c72cf43011d176fc44c79d6e113094ca1e152/openai_harmony-0.0.8-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a3a16972aa1cee38ea958470cd04ac9a2d5ac38fdcf77ab686611246220c158", size = 2959765, upload-time = "2025-11-05T19:06:53.62Z" }, @@ -1575,8 +1554,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1d/10/4327dbf87f75ae813405fd9a9b4a5cde63d506ffed0a096a440a4cabd89c/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_armv7l.whl", hash = 
"sha256:cbaa3bda75ef0d8836e1f8cc84af62f971b1d756d740efc95c38c3e04c0bfde2", size = 2932931, upload-time = "2025-11-05T19:07:01.437Z" }, { url = "https://files.pythonhosted.org/packages/8a/c8/1774eec4f6f360ef57618fb8f52e3d3af245b2491bd0297513aa09eec04b/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:772922a9bd24e133950fad71eb1550836f415a88e8c77870e12d0c3bd688ddc2", size = 2996140, upload-time = "2025-11-05T19:07:03.438Z" }, { url = "https://files.pythonhosted.org/packages/60/c3/3d1e01e2dba517a91760e4a03e4f20ffc75039a6fe584d0e6f9b5c78fd15/openai_harmony-0.0.8-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:007b0476a1f331f8130783f901f1da6f5a7057af1a4891f1b6a31dec364189b5", size = 3205080, upload-time = "2025-11-05T19:07:05.078Z" }, - { url = "https://files.pythonhosted.org/packages/14/63/119de431572d7c70a7bf1037034a9be6ed0a7502a7498ba7302bca5b3242/openai_harmony-0.0.8-cp38-abi3-win32.whl", hash = "sha256:a9b5f893326b28d9e935ade14b4f655f5a840942473bc89b201c25f7a15af9cf", size = 2082457, upload-time = "2025-11-05T19:07:09.631Z" }, - { url = "https://files.pythonhosted.org/packages/40/1f/c83cf5a206c263ee70448a5ae4264682555f4d0b5bed0d2cc6ca1108103d/openai_harmony-0.0.8-cp38-abi3-win_amd64.whl", hash = "sha256:39d44f0d8f466bd56698e7ead708bead3141e27b9b87e3ab7d5a6d0e4a869ee5", size = 2438369, upload-time = "2025-11-05T19:07:08.1Z" }, ] [[package]] @@ -1584,16 +1561,12 @@ name = "opencv-python-headless" version = "4.12.0.88" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a4/63/6861102ec149c3cd298f4d1ea7ce9d6adbc7529221606ff1dab991a19adb/opencv-python-headless-4.12.0.88.tar.gz", hash = "sha256:cfdc017ddf2e59b6c2f53bc12d74b6b0be7ded4ec59083ea70763921af2b6c09", size = 95379675, upload-time = "2025-07-07T09:21:06.815Z" } wheels 
= [ - { url = "https://files.pythonhosted.org/packages/f7/7d/414e243c5c8216a5277afd104a319cc1291c5e23f5eeef512db5629ee7f4/opencv_python_headless-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:1e58d664809b3350c1123484dd441e1667cd7bed3086db1b9ea1b6f6cb20b50e", size = 37877864, upload-time = "2025-07-07T09:14:41.693Z" }, - { url = "https://files.pythonhosted.org/packages/05/14/7e162714beed1cd5e7b5eb66fcbcba2f065c51b1d9da2463024c84d2f7c0/opencv_python_headless-4.12.0.88-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:365bb2e486b50feffc2d07a405b953a8f3e8eaa63865bc650034e5c71e7a5154", size = 57326608, upload-time = "2025-07-07T09:14:51.885Z" }, { url = "https://files.pythonhosted.org/packages/69/4e/116720df7f1f7f3b59abc608ca30fbec9d2b3ae810afe4e4d26483d9dfa0/opencv_python_headless-4.12.0.88-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:aeb4b13ecb8b4a0beb2668ea07928160ea7c2cd2d9b5ef571bbee6bafe9cc8d0", size = 33145800, upload-time = "2025-07-07T09:15:00.367Z" }, { url = "https://files.pythonhosted.org/packages/89/53/e19c21e0c4eb1275c3e2c97b081103b6dfb3938172264d283a519bf728b9/opencv_python_headless-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:236c8df54a90f4d02076e6f9c1cc763d794542e886c576a6fee46ec8ff75a7a9", size = 54023419, upload-time = "2025-07-07T09:15:10.164Z" }, - { url = "https://files.pythonhosted.org/packages/bf/9c/a76fd5414de6ec9f21f763a600058a0c3e290053cea87e0275692b1375c0/opencv_python_headless-4.12.0.88-cp37-abi3-win32.whl", hash = "sha256:fde2cf5c51e4def5f2132d78e0c08f9c14783cd67356922182c6845b9af87dbd", size = 30225230, upload-time = "2025-07-07T09:15:17.045Z" }, - { url = "https://files.pythonhosted.org/packages/f2/35/0858e9e71b36948eafbc5e835874b63e515179dc3b742cbe3d76bc683439/opencv_python_headless-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:86b413bdd6c6bf497832e346cd5371995de148e579b9774f8eba686dee3f5528", size = 38923559, upload-time = "2025-07-07T09:15:25.229Z" 
}, ] [[package]] @@ -1602,14 +1575,8 @@ version = "0.2.11" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/1a/d3/e04e9145f8f806723dec9b9e5227ad695a3efcd3ced7794cf7c22b15df5e/outlines_core-0.2.11.tar.gz", hash = "sha256:dfce56f717ff5083e54cbcfdb66cad243365437fccbb5509adaa7e31e030f1d8", size = 197263, upload-time = "2025-05-19T10:12:51.719Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/5f/2c/c7636823244c70e2960060bf9bd978248dffb55c5e7c91c46d18354b2a24/outlines_core-0.2.11-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:4a9db4872bae083631d720994f4cee603bce0536b33d5a988814576863b657cf", size = 1957668, upload-time = "2025-05-19T10:12:18.29Z" }, - { url = "https://files.pythonhosted.org/packages/c7/09/5c62047da139d722317a444a4d01cd5f11943a8c2eaecce784341dd0844a/outlines_core-0.2.11-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8359a45c59f6a8f2eb717245806501a59044c75f6ea8bd08faaa131cc8cdec45", size = 2130493, upload-time = "2025-05-19T10:12:19.537Z" }, - { url = "https://files.pythonhosted.org/packages/89/7a/d6a2810f90e37d550168e0c0a9a915086ea721444727e3ca2c630898d1ef/outlines_core-0.2.11-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:5d26a46591377340e0b870b8a96ea8341058341a62ee0bded9098e0c88dd24f4", size = 1956804, upload-time = "2025-05-19T10:12:20.755Z" }, - { url = "https://files.pythonhosted.org/packages/ca/ea/339e6c273b5581128c3b7ca27d428d8993c3085912af1a467aa32ef0e9d1/outlines_core-0.2.11-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:ae460a34675fb11d92a5c605a480fbae4cd6c1b2d11b3698da64a7fcaba64dcf", size = 2127085, upload-time = "2025-05-19T10:12:22.02Z" }, { url = "https://files.pythonhosted.org/packages/92/c7/a65d1fddf49830ebc41422294eacde35286d9f68994a8aa905cb14f5aade/outlines_core-0.2.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86df9740368866295077346440d911df4972da2b3f1f54b8125e6f329e8a8891", size = 2287677, upload-time = 
"2025-05-19T10:12:24.24Z" }, { url = "https://files.pythonhosted.org/packages/23/79/8795aed8be9b77dd69d78e7cfbfcf28c179e6b08da6e56bbbf48a09fe55f/outlines_core-0.2.11-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:96ce4dd78f106799be4a0a5795cefd1352806162973756a4b6fce4bb6eddd7e4", size = 2113000, upload-time = "2025-05-19T10:12:25.446Z" }, - { url = "https://files.pythonhosted.org/packages/59/e3/cbe9294b06d92ee1892dbb6f2125d833d68e8629d45d080d6daba54eec2d/outlines_core-0.2.11-cp312-cp312-win32.whl", hash = "sha256:358db161cce3650ba822e118dcf0a1efa571c7deb4864ab9d64ca2c9cca7425d", size = 1765703, upload-time = "2025-05-19T10:12:26.693Z" }, - { url = "https://files.pythonhosted.org/packages/1d/c9/ed3cf362515fac16e313368b9b2f2497051f4ded88679205830b6f889f54/outlines_core-0.2.11-cp312-cp312-win_amd64.whl", hash = "sha256:231f9d20d2630c70665345821780d7808b29539620a75c99f65113b518c51032", size = 2060945, upload-time = "2025-05-19T10:12:28.294Z" }, ] [[package]] @@ -1626,7 +1593,8 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -1644,11 +1612,11 @@ wheels = [ [[package]] name = "partial-json-parser" -version = "0.2.1.1.post6" +version = "0.2.1.1.post7" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/86/13/459e86c9c67a006651803a3df3d0b08f7708bc5483fdc482582d75562949/partial_json_parser-0.2.1.1.post6.tar.gz", hash = "sha256:43896b68929678224cbbe4884a6a5fe9251ded4b30b8b7d7eb569e5feea93afc", size = 10299, upload-time = "2025-06-23T17:51:45.372Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/6a/6d/eed37d7ebc1e0bcd27b831c0cf1fe94881934316187c4b30d23f29ea0bd4/partial_json_parser-0.2.1.1.post7.tar.gz", hash = "sha256:86590e1ba6bcb6739a2dfc17d2323f028cb5884f4c6ce23db376999132c9a922", size = 10296, upload-time = "2025-11-17T07:27:41.202Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/40/1f922794af3dc7503f19319a8804b398a161a2cd54183cff8b12225b8d85/partial_json_parser-0.2.1.1.post6-py3-none-any.whl", hash = "sha256:abc332f09b13ef5233384dbfe7128a0e9ea3fa4b8f8be9b37ac1b433c810e99e", size = 10876, upload-time = "2025-06-23T17:51:44.332Z" }, + { url = "https://files.pythonhosted.org/packages/42/32/658973117bf0fd82a24abbfb94fe73a5e86216e49342985e10acce54775a/partial_json_parser-0.2.1.1.post7-py3-none-any.whl", hash = "sha256:145119e5eabcf80cbb13844a6b50a85c68bf99d376f8ed771e2a3c3b03e653ae", size = 10877, upload-time = "2025-11-17T07:27:40.457Z" }, ] [[package]] @@ -1658,12 +1626,15 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "accelerate" }, { name = "huggingface-hub" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, { name = "packaging" }, { name = "psutil" }, { name = "pyyaml" }, { name = "safetensors" }, - { name = "torch" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.9.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, { name = "tqdm" }, { name = "transformers" }, ] @@ -1678,17 
+1649,21 @@ version = "12.0.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/90/4fcce2c22caf044e660a198d740e7fbc14395619e3cb1abad12192c0826c/pillow-12.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:53561a4ddc36facb432fae7a9d8afbfaf94795414f5cdc5fc52f28c1dca90371", size = 5249377, upload-time = "2025-10-15T18:22:05.993Z" }, - { url = "https://files.pythonhosted.org/packages/fd/e0/ed960067543d080691d47d6938ebccbf3976a931c9567ab2fbfab983a5dd/pillow-12.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:71db6b4c1653045dacc1585c1b0d184004f0d7e694c7b34ac165ca70c0838082", size = 4650343, upload-time = "2025-10-15T18:22:07.718Z" }, { url = "https://files.pythonhosted.org/packages/e7/a1/f81fdeddcb99c044bf7d6faa47e12850f13cee0849537a7d27eeab5534d4/pillow-12.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2fa5f0b6716fc88f11380b88b31fe591a06c6315e955c096c35715788b339e3f", size = 6232981, upload-time = "2025-10-15T18:22:09.287Z" }, { url = "https://files.pythonhosted.org/packages/88/e1/9098d3ce341a8750b55b0e00c03f1630d6178f38ac191c81c97a3b047b44/pillow-12.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:82240051c6ca513c616f7f9da06e871f61bfd7805f566275841af15015b8f98d", size = 8041399, upload-time = "2025-10-15T18:22:10.872Z" }, { url = "https://files.pythonhosted.org/packages/a7/62/a22e8d3b602ae8cc01446d0c57a54e982737f44b6f2e1e019a925143771d/pillow-12.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55f818bd74fe2f11d4d7cbc65880a843c4075e0ac7226bc1a23261dbea531953", size = 6347740, upload-time = 
"2025-10-15T18:22:12.769Z" }, { url = "https://files.pythonhosted.org/packages/4f/87/424511bdcd02c8d7acf9f65caa09f291a519b16bd83c3fb3374b3d4ae951/pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b87843e225e74576437fd5b6a4c2205d422754f84a06942cfaf1dc32243e45a8", size = 7040201, upload-time = "2025-10-15T18:22:14.813Z" }, { url = "https://files.pythonhosted.org/packages/dc/4d/435c8ac688c54d11755aedfdd9f29c9eeddf68d150fe42d1d3dbd2365149/pillow-12.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c607c90ba67533e1b2355b821fef6764d1dd2cbe26b8c1005ae84f7aea25ff79", size = 6462334, upload-time = "2025-10-15T18:22:16.375Z" }, { url = "https://files.pythonhosted.org/packages/2b/f2/ad34167a8059a59b8ad10bc5c72d4d9b35acc6b7c0877af8ac885b5f2044/pillow-12.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:21f241bdd5080a15bc86d3466a9f6074a9c2c2b314100dd896ac81ee6db2f1ba", size = 7134162, upload-time = "2025-10-15T18:22:17.996Z" }, - { url = "https://files.pythonhosted.org/packages/0c/b1/a7391df6adacf0a5c2cf6ac1cf1fcc1369e7d439d28f637a847f8803beb3/pillow-12.0.0-cp312-cp312-win32.whl", hash = "sha256:dd333073e0cacdc3089525c7df7d39b211bcdf31fc2824e49d01c6b6187b07d0", size = 6298769, upload-time = "2025-10-15T18:22:19.923Z" }, - { url = "https://files.pythonhosted.org/packages/a2/0b/d87733741526541c909bbf159e338dcace4f982daac6e5a8d6be225ca32d/pillow-12.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe611163f6303d1619bbcb653540a4d60f9e55e622d60a3108be0d5b441017a", size = 7001107, upload-time = "2025-10-15T18:22:21.644Z" }, - { url = "https://files.pythonhosted.org/packages/bc/96/aaa61ce33cc98421fb6088af2a03be4157b1e7e0e87087c888e2370a7f45/pillow-12.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:7dfb439562f234f7d57b1ac6bc8fe7f838a4bd49c79230e0f6a1da93e82f1fad", size = 2436012, upload-time = "2025-10-15T18:22:23.621Z" }, +] + +[[package]] +name = "platformdirs" +version = "4.5.1" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/cf/86/0248f086a84f01b37aaec0fa567b397df1a119f73c16f6c7a9aac73ea309/platformdirs-4.5.1.tar.gz", hash = "sha256:61d5cdcc6065745cdd94f0f878977f8de9437be93de97c1c12f853c9c0cdcbda", size = 21715, upload-time = "2025-12-05T13:52:58.638Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31", size = 18731, upload-time = "2025-12-05T13:52:56.823Z" }, ] [[package]] @@ -1714,8 +1689,8 @@ name = "prometheus-fastapi-instrumentator" version = "7.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "prometheus-client" }, - { name = "starlette" }, + { name = "prometheus-client", marker = "sys_platform == 'linux'" }, + { name = "starlette", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/69/6d/24d53033cf93826aa7857699a4450c1c67e5b9c710e925b1ed2b320c04df/prometheus_fastapi_instrumentator-7.1.0.tar.gz", hash = "sha256:be7cd61eeea4e5912aeccb4261c6631b3f227d8924542d79eaf5af3f439cbe5e", size = 20220, upload-time = "2025-03-19T19:35:05.351Z" } wheels = [ @@ -1748,31 +1723,33 @@ wheels = [ [[package]] name = "protobuf" -version = "6.33.0" +version = "6.33.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/ff/64a6c8f420818bb873713988ca5492cba3a7946be57e027ac63495157d97/protobuf-6.33.0.tar.gz", hash = "sha256:140303d5c8d2037730c548f8c7b93b20bb1dc301be280c378b82b8894589c954", size = 443463, upload-time = "2025-10-15T20:39:52.159Z" } +sdist = { url = "https://files.pythonhosted.org/packages/34/44/e49ecff446afeec9d1a66d6bbf9adc21e3c7cea7803a920ca3773379d4f6/protobuf-6.33.2.tar.gz", hash = "sha256:56dc370c91fbb8ac85bc13582c9e373569668a290aa2e66a590c2a0d35ddb9e4", size = 444296, upload-time = 
"2025-12-06T00:17:53.311Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/ee/52b3fa8feb6db4a833dfea4943e175ce645144532e8a90f72571ad85df4e/protobuf-6.33.0-cp310-abi3-win32.whl", hash = "sha256:d6101ded078042a8f17959eccd9236fb7a9ca20d3b0098bbcb91533a5680d035", size = 425593, upload-time = "2025-10-15T20:39:40.29Z" }, - { url = "https://files.pythonhosted.org/packages/7b/c6/7a465f1825872c55e0341ff4a80198743f73b69ce5d43ab18043699d1d81/protobuf-6.33.0-cp310-abi3-win_amd64.whl", hash = "sha256:9a031d10f703f03768f2743a1c403af050b6ae1f3480e9c140f39c45f81b13ee", size = 436882, upload-time = "2025-10-15T20:39:42.841Z" }, - { url = "https://files.pythonhosted.org/packages/e1/a9/b6eee662a6951b9c3640e8e452ab3e09f117d99fc10baa32d1581a0d4099/protobuf-6.33.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:905b07a65f1a4b72412314082c7dbfae91a9e8b68a0cc1577515f8df58ecf455", size = 427521, upload-time = "2025-10-15T20:39:43.803Z" }, - { url = "https://files.pythonhosted.org/packages/10/35/16d31e0f92c6d2f0e77c2a3ba93185130ea13053dd16200a57434c882f2b/protobuf-6.33.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e0697ece353e6239b90ee43a9231318302ad8353c70e6e45499fa52396debf90", size = 324445, upload-time = "2025-10-15T20:39:44.932Z" }, - { url = "https://files.pythonhosted.org/packages/e6/eb/2a981a13e35cda8b75b5585aaffae2eb904f8f351bdd3870769692acbd8a/protobuf-6.33.0-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:e0a1715e4f27355afd9570f3ea369735afc853a6c3951a6afe1f80d8569ad298", size = 339159, upload-time = "2025-10-15T20:39:46.186Z" }, - { url = "https://files.pythonhosted.org/packages/21/51/0b1cbad62074439b867b4e04cc09b93f6699d78fd191bed2bbb44562e077/protobuf-6.33.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:35be49fd3f4fefa4e6e2aacc35e8b837d6703c37a2168a55ac21e9b1bc7559ef", size = 323172, upload-time = "2025-10-15T20:39:47.465Z" }, - { url = 
"https://files.pythonhosted.org/packages/07/d1/0a28c21707807c6aacd5dc9c3704b2aa1effbf37adebd8caeaf68b17a636/protobuf-6.33.0-py3-none-any.whl", hash = "sha256:25c9e1963c6734448ea2d308cfa610e692b801304ba0908d7bfa564ac5132995", size = 170477, upload-time = "2025-10-15T20:39:51.311Z" }, + { url = "https://files.pythonhosted.org/packages/bc/91/1e3a34881a88697a7354ffd177e8746e97a722e5e8db101544b47e84afb1/protobuf-6.33.2-cp310-abi3-win32.whl", hash = "sha256:87eb388bd2d0f78febd8f4c8779c79247b26a5befad525008e49a6955787ff3d", size = 425603, upload-time = "2025-12-06T00:17:41.114Z" }, + { url = "https://files.pythonhosted.org/packages/64/20/4d50191997e917ae13ad0a235c8b42d8c1ab9c3e6fd455ca16d416944355/protobuf-6.33.2-cp310-abi3-win_amd64.whl", hash = "sha256:fc2a0e8b05b180e5fc0dd1559fe8ebdae21a27e81ac77728fb6c42b12c7419b4", size = 436930, upload-time = "2025-12-06T00:17:43.278Z" }, + { url = "https://files.pythonhosted.org/packages/b2/ca/7e485da88ba45c920fb3f50ae78de29ab925d9e54ef0de678306abfbb497/protobuf-6.33.2-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d9b19771ca75935b3a4422957bc518b0cecb978b31d1dd12037b088f6bcc0e43", size = 427621, upload-time = "2025-12-06T00:17:44.445Z" }, + { url = "https://files.pythonhosted.org/packages/7d/4f/f743761e41d3b2b2566748eb76bbff2b43e14d5fcab694f494a16458b05f/protobuf-6.33.2-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:b5d3b5625192214066d99b2b605f5783483575656784de223f00a8d00754fc0e", size = 324460, upload-time = "2025-12-06T00:17:45.678Z" }, + { url = "https://files.pythonhosted.org/packages/b1/fa/26468d00a92824020f6f2090d827078c09c9c587e34cbfd2d0c7911221f8/protobuf-6.33.2-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:8cd7640aee0b7828b6d03ae518b5b4806fdfc1afe8de82f79c3454f8aef29872", size = 339168, upload-time = "2025-12-06T00:17:46.813Z" }, + { url = "https://files.pythonhosted.org/packages/56/13/333b8f421738f149d4fe5e49553bc2a2ab75235486259f689b4b91f96cec/protobuf-6.33.2-cp39-abi3-manylinux2014_x86_64.whl", hash = 
"sha256:1f8017c48c07ec5859106533b682260ba3d7c5567b1ca1f24297ce03384d1b4f", size = 323270, upload-time = "2025-12-06T00:17:48.253Z" }, + { url = "https://files.pythonhosted.org/packages/0e/15/4f02896cc3df04fc465010a4c6a0cd89810f54617a32a70ef531ed75d61c/protobuf-6.33.2-py3-none-any.whl", hash = "sha256:7636aad9bb01768870266de5dc009de2d1b936771b38a793f73cbbf279c91c5c", size = 170501, upload-time = "2025-12-06T00:17:52.211Z" }, ] [[package]] name = "psutil" -version = "7.1.3" +version = "7.2.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e1/88/bdd0a41e5857d5d703287598cbf08dad90aed56774ea52ae071bae9071b6/psutil-7.1.3.tar.gz", hash = "sha256:6c86281738d77335af7aec228328e944b30930899ea760ecf33a4dba66be5e74", size = 489059, upload-time = "2025-11-02T12:25:54.619Z" } +sdist = { url = "https://files.pythonhosted.org/packages/be/7c/31d1c3ceb1260301f87565f50689dc6da3db427ece1e1e012af22abca54e/psutil-7.2.0.tar.gz", hash = "sha256:2e4f8e1552f77d14dc96fb0f6240c5b34a37081c0889f0853b3b29a496e5ef64", size = 489863, upload-time = "2025-12-23T20:26:24.616Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ef/94/46b9154a800253e7ecff5aaacdf8ebf43db99de4a2dfa18575b02548654e/psutil-7.1.3-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bdbcd0e58ca14996a42adf3621a6244f1bb2e2e528886959c72cf1e326677ab", size = 238359, upload-time = "2025-11-02T12:26:25.284Z" }, - { url = "https://files.pythonhosted.org/packages/68/3a/9f93cff5c025029a36d9a92fef47220ab4692ee7f2be0fba9f92813d0cb8/psutil-7.1.3-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:bc31fa00f1fbc3c3802141eede66f3a2d51d89716a194bf2cd6fc68310a19880", size = 239171, upload-time = "2025-11-02T12:26:27.23Z" }, - { url = "https://files.pythonhosted.org/packages/ce/b1/5f49af514f76431ba4eea935b8ad3725cdeb397e9245ab919dbc1d1dc20f/psutil-7.1.3-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:3bb428f9f05c1225a558f53e30ccbad9930b11c3fc206836242de1091d3e7dd3", size = 263261, upload-time = "2025-11-02T12:26:29.48Z" }, - { url = "https://files.pythonhosted.org/packages/e0/95/992c8816a74016eb095e73585d747e0a8ea21a061ed3689474fabb29a395/psutil-7.1.3-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56d974e02ca2c8eb4812c3f76c30e28836fffc311d55d979f1465c1feeb2b68b", size = 264635, upload-time = "2025-11-02T12:26:31.74Z" }, - { url = "https://files.pythonhosted.org/packages/55/4c/c3ed1a622b6ae2fd3c945a366e64eb35247a31e4db16cf5095e269e8eb3c/psutil-7.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:f39c2c19fe824b47484b96f9692932248a54c43799a84282cfe58d05a6449efd", size = 247633, upload-time = "2025-11-02T12:26:33.887Z" }, - { url = "https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" }, + { url = "https://files.pythonhosted.org/packages/40/c5/a49160bf3e165b7b93a60579a353cf5d939d7f878fe5fd369110f1d18043/psutil-7.2.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:977a2fcd132d15cb05b32b2d85b98d087cad039b0ce435731670ba74da9e6133", size = 128116, upload-time = "2025-12-23T20:26:53.516Z" }, + { url = "https://files.pythonhosted.org/packages/10/a1/c75feb480f60cd768fb6ed00ac362a16a33e5076ec8475a22d8162fb2659/psutil-7.2.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:24151011c21fadd94214d7139d7c6c54569290d7e553989bdf0eab73b13beb8c", size = 128925, upload-time = "2025-12-23T20:26:55.573Z" }, + { url = "https://files.pythonhosted.org/packages/12/ff/e93136587c00a543f4bc768b157fac2c47cd77b180d4f4e5c6efb6ea53a2/psutil-7.2.0-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:91f211ba9279e7c61d9d8f84b713cfc38fa161cb0597d5cb3f1ca742f6848254", size = 
154666, upload-time = "2025-12-23T20:26:57.312Z" }, + { url = "https://files.pythonhosted.org/packages/b8/dd/4c2de9c3827c892599d277a69d2224136800870a8a88a80981de905de28d/psutil-7.2.0-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f37415188b7ea98faf90fed51131181646c59098b077550246e2e092e127418b", size = 156109, upload-time = "2025-12-23T20:26:58.851Z" }, + { url = "https://files.pythonhosted.org/packages/81/3f/090943c682d3629968dd0b04826ddcbc760ee1379021dbe316e2ddfcd01b/psutil-7.2.0-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0d12c7ce6ed1128cd81fd54606afa054ac7dbb9773469ebb58cf2f171c49f2ac", size = 148081, upload-time = "2025-12-23T20:27:01.318Z" }, + { url = "https://files.pythonhosted.org/packages/c4/88/c39648ebb8ec182d0364af53cdefe6eddb5f3872ba718b5855a8ff65d6d4/psutil-7.2.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ca0faef7976530940dcd39bc5382d0d0d5eb023b186a4901ca341bd8d8684151", size = 147376, upload-time = "2025-12-23T20:27:03.347Z" }, + { url = "https://files.pythonhosted.org/packages/01/a2/5b39e08bd9b27476bc7cce7e21c71a481ad60b81ffac49baf02687a50d7f/psutil-7.2.0-cp37-abi3-win_amd64.whl", hash = "sha256:abdb74137ca232d20250e9ad471f58d500e7743bc8253ba0bfbf26e570c0e437", size = 136910, upload-time = "2025-12-23T20:27:05.289Z" }, + { url = "https://files.pythonhosted.org/packages/59/54/53839db1258c1eaeb4ded57ff202144ebc75b23facc05a74fd98d338b0c6/psutil-7.2.0-cp37-abi3-win_arm64.whl", hash = "sha256:284e71038b3139e7ab3834b63b3eb5aa5565fcd61a681ec746ef9a0a8c457fd2", size = 133807, upload-time = "2025-12-23T20:27:06.825Z" }, ] [[package]] @@ -1801,29 +1778,26 @@ wheels = [ [[package]] name = "pybase64" -version = "1.4.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/04/14/43297a7b7f0c1bf0c00b596f754ee3ac946128c64d21047ccf9c9bbc5165/pybase64-1.4.2.tar.gz", hash = 
"sha256:46cdefd283ed9643315d952fe44de80dc9b9a811ce6e3ec97fd1827af97692d0", size = 137246, upload-time = "2025-07-27T13:08:57.808Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/28/6d/11ede991e800797b9f5ebd528013b34eee5652df93de61ffb24503393fa5/pybase64-1.4.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:db2c75d1388855b5a1015b65096d7dbcc708e7de3245dcbedeb872ec05a09326", size = 38326, upload-time = "2025-07-27T13:03:09.065Z" }, - { url = "https://files.pythonhosted.org/packages/fe/84/87f1f565f42e2397e2aaa2477c86419f5173c3699881c42325c090982f0a/pybase64-1.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b621a972a01841368fdb9dedc55fd3c6e0c7217d0505ba3b1ebe95e7ef1b493", size = 31661, upload-time = "2025-07-27T13:03:10.295Z" }, - { url = "https://files.pythonhosted.org/packages/cb/2a/a24c810e7a61d2cc6f73fe9ee4872a03030887fa8654150901b15f376f65/pybase64-1.4.2-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f48c32ac6a16cbf57a5a96a073fef6ff7e3526f623cd49faa112b7f9980bafba", size = 68192, upload-time = "2025-07-27T13:03:11.467Z" }, - { url = "https://files.pythonhosted.org/packages/ee/87/d9baf98cbfc37b8657290ad4421f3a3c36aa0eafe4872c5859cfb52f3448/pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ace8b23093a6bb862477080d9059b784096ab2f97541e8bfc40d42f062875149", size = 71587, upload-time = "2025-07-27T13:03:12.719Z" }, - { url = "https://files.pythonhosted.org/packages/0b/89/3df043cc56ef3b91b7aa0c26ae822a2d7ec8da0b0fd7c309c879b0eb5988/pybase64-1.4.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1772c7532a7fb6301baea3dd3e010148dbf70cd1136a83c2f5f91bdc94822145", size = 59910, upload-time = "2025-07-27T13:03:14.266Z" }, - { url = 
"https://files.pythonhosted.org/packages/75/4f/6641e9edf37aeb4d4524dc7ba2168eff8d96c90e77f6283c2be3400ab380/pybase64-1.4.2-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:f86f7faddcba5cbfea475f8ab96567834c28bf09ca6c7c3d66ee445adac80d8f", size = 56701, upload-time = "2025-07-27T13:03:15.6Z" }, - { url = "https://files.pythonhosted.org/packages/2d/7f/20d8ac1046f12420a0954a45a13033e75f98aade36eecd00c64e3549b071/pybase64-1.4.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:0b8c8e275b5294089f314814b4a50174ab90af79d6a4850f6ae11261ff6a7372", size = 59288, upload-time = "2025-07-27T13:03:16.823Z" }, - { url = "https://files.pythonhosted.org/packages/17/ea/9c0ca570e3e50b3c6c3442e280c83b321a0464c86a9db1f982a4ff531550/pybase64-1.4.2-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:864d85a0470c615807ae8b97d724d068b940a2d10ac13a5f1b9e75a3ce441758", size = 60267, upload-time = "2025-07-27T13:03:18.132Z" }, - { url = "https://files.pythonhosted.org/packages/f9/ac/46894929d71ccedebbfb0284173b0fea96bc029cd262654ba8451a7035d6/pybase64-1.4.2-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:47254d97ed2d8351e30ecfdb9e2414547f66ba73f8a09f932c9378ff75cd10c5", size = 54801, upload-time = "2025-07-27T13:03:19.669Z" }, - { url = "https://files.pythonhosted.org/packages/6a/1e/02c95218ea964f0b2469717c2c69b48e63f4ca9f18af01a5b2a29e4c1216/pybase64-1.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:264b65ecc4f0ee73f3298ab83bbd8008f7f9578361b8df5b448f985d8c63e02a", size = 58599, upload-time = "2025-07-27T13:03:20.951Z" }, - { url = "https://files.pythonhosted.org/packages/15/45/ccc21004930789b8fb439d43e3212a6c260ccddb2bf450c39a20db093f33/pybase64-1.4.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:fbcc2b30cd740c16c9699f596f22c7a9e643591311ae72b1e776f2d539e9dd9d", size = 52388, upload-time = "2025-07-27T13:03:23.064Z" }, - { url = 
"https://files.pythonhosted.org/packages/c4/45/22e46e549710c4c237d77785b6fb1bc4c44c288a5c44237ba9daf5c34b82/pybase64-1.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cda9f79c22d51ee4508f5a43b673565f1d26af4330c99f114e37e3186fdd3607", size = 68802, upload-time = "2025-07-27T13:03:24.673Z" }, - { url = "https://files.pythonhosted.org/packages/55/0c/232c6261b81296e5593549b36e6e7884a5da008776d12665923446322c36/pybase64-1.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0c91c6d2a7232e2a1cd10b3b75a8bb657defacd4295a1e5e80455df2dfc84d4f", size = 57841, upload-time = "2025-07-27T13:03:25.948Z" }, - { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" }, - { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" }, - { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" }, - { url = "https://files.pythonhosted.org/packages/d3/67/e2b6cb32c782e12304d467418e70da0212567f42bd4d3b5eb1fdf64920ad/pybase64-1.4.2-cp312-cp312-win32.whl", hash = "sha256:a6e5688b18d558e8c6b8701cc8560836c4bbeba61d33c836b4dba56b19423716", size = 33683, upload-time = "2025-07-27T13:03:31.775Z" }, - { url = 
"https://files.pythonhosted.org/packages/4f/bc/d5c277496063a09707486180f17abbdbdebbf2f5c4441b20b11d3cb7dc7c/pybase64-1.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:c995d21b8bd08aa179cd7dd4db0695c185486ecc72da1e8f6c37ec86cadb8182", size = 35817, upload-time = "2025-07-27T13:03:32.99Z" }, - { url = "https://files.pythonhosted.org/packages/e6/69/e4be18ae685acff0ae77f75d4586590f29d2cd187bf603290cf1d635cad4/pybase64-1.4.2-cp312-cp312-win_arm64.whl", hash = "sha256:e254b9258c40509c2ea063a7784f6994988f3f26099d6e08704e3c15dfed9a55", size = 30900, upload-time = "2025-07-27T13:03:34.499Z" }, +version = "1.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/b8/4ed5c7ad5ec15b08d35cc79ace6145d5c1ae426e46435f4987379439dfea/pybase64-1.4.3.tar.gz", hash = "sha256:c2ed274c9e0ba9c8f9c4083cfe265e66dd679126cd9c2027965d807352f3f053", size = 137272, upload-time = "2025-12-06T13:27:04.013Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/1b/9a8cab0042b464e9a876d5c65fe5127445a2436da36fda64899b119b1a1b/pybase64-1.4.3-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:f0b3f200c3e06316f6bebabd458b4e4bcd4c2ca26af7c0c766614d91968dee27", size = 68210, upload-time = "2025-12-06T13:23:18.813Z" }, + { url = "https://files.pythonhosted.org/packages/62/f7/965b79ff391ad208b50e412b5d3205ccce372a2d27b7218ae86d5295b105/pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bb632edfd132b3eaf90c39c89aa314beec4e946e210099b57d40311f704e11d4", size = 71599, upload-time = "2025-12-06T13:23:20.195Z" }, + { url = "https://files.pythonhosted.org/packages/03/4b/a3b5175130b3810bbb8ccfa1edaadbd3afddb9992d877c8a1e2f274b476e/pybase64-1.4.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:356ef1d74648ce997f5a777cf8f1aefecc1c0b4fe6201e0ef3ec8a08170e1b54", size = 59922, 
upload-time = "2025-12-06T13:23:21.487Z" }, + { url = "https://files.pythonhosted.org/packages/da/5d/c38d1572027fc601b62d7a407721688b04b4d065d60ca489912d6893e6cf/pybase64-1.4.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:c48361f90db32bacaa5518419d4eb9066ba558013aaf0c7781620279ecddaeb9", size = 56712, upload-time = "2025-12-06T13:23:22.77Z" }, + { url = "https://files.pythonhosted.org/packages/e7/d4/4e04472fef485caa8f561d904d4d69210a8f8fc1608ea15ebd9012b92655/pybase64-1.4.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:702bcaa16ae02139d881aeaef5b1c8ffb4a3fae062fe601d1e3835e10310a517", size = 59300, upload-time = "2025-12-06T13:23:24.543Z" }, + { url = "https://files.pythonhosted.org/packages/86/e7/16e29721b86734b881d09b7e23dfd7c8408ad01a4f4c7525f3b1088e25ec/pybase64-1.4.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:53d0ffe1847b16b647c6413d34d1de08942b7724273dd57e67dcbdb10c574045", size = 60278, upload-time = "2025-12-06T13:23:25.608Z" }, + { url = "https://files.pythonhosted.org/packages/b1/02/18515f211d7c046be32070709a8efeeef8a0203de4fd7521e6b56404731b/pybase64-1.4.3-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:9a1792e8b830a92736dae58f0c386062eb038dfe8004fb03ba33b6083d89cd43", size = 54817, upload-time = "2025-12-06T13:23:26.633Z" }, + { url = "https://files.pythonhosted.org/packages/e7/be/14e29d8e1a481dbff151324c96dd7b5d2688194bb65dc8a00ca0e1ad1e86/pybase64-1.4.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1d468b1b1ac5ad84875a46eaa458663c3721e8be5f155ade356406848d3701f6", size = 58611, upload-time = "2025-12-06T13:23:27.684Z" }, + { url = "https://files.pythonhosted.org/packages/b4/8a/a2588dfe24e1bbd742a554553778ab0d65fdf3d1c9a06d10b77047d142aa/pybase64-1.4.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:e97b7bdbd62e71898cd542a6a9e320d9da754ff3ebd02cb802d69087ee94d468", size = 52404, upload-time = "2025-12-06T13:23:28.714Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/fc/afcda7445bebe0cbc38cafdd7813234cdd4fc5573ff067f1abf317bb0cec/pybase64-1.4.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b33aeaa780caaa08ffda87fc584d5eab61e3d3bbb5d86ead02161dc0c20d04bc", size = 68817, upload-time = "2025-12-06T13:23:30.079Z" }, + { url = "https://files.pythonhosted.org/packages/d3/3a/87c3201e555ed71f73e961a787241a2438c2bbb2ca8809c29ddf938a3157/pybase64-1.4.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c0efcf78f11cf866bed49caa7b97552bc4855a892f9cc2372abcd3ed0056f0d", size = 57854, upload-time = "2025-12-06T13:23:31.17Z" }, + { url = "https://files.pythonhosted.org/packages/fd/7d/931c2539b31a7b375e7d595b88401eeb5bd6c5ce1059c9123f9b608aaa14/pybase64-1.4.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:66e3791f2ed725a46593f8bd2761ff37d01e2cdad065b1dceb89066f476e50c6", size = 54333, upload-time = "2025-12-06T13:23:32.422Z" }, + { url = "https://files.pythonhosted.org/packages/de/5e/537601e02cc01f27e9d75f440f1a6095b8df44fc28b1eef2cd739aea8cec/pybase64-1.4.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:72bb0b6bddadab26e1b069bb78e83092711a111a80a0d6b9edcb08199ad7299b", size = 56492, upload-time = "2025-12-06T13:23:33.515Z" }, + { url = "https://files.pythonhosted.org/packages/96/97/2a2e57acf8f5c9258d22aba52e71f8050e167b29ed2ee1113677c1b600c1/pybase64-1.4.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5b3365dbcbcdb0a294f0f50af0c0a16b27a232eddeeb0bceeefd844ef30d2a23", size = 70974, upload-time = "2025-12-06T13:23:36.27Z" }, + { url = "https://files.pythonhosted.org/packages/fa/8f/43c3bb11ca9bacf81cb0b7a71500bb65b2eda6d5fe07433c09b543de97f3/pybase64-1.4.3-graalpy312-graalpy250_312_native-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5c29a582b0ea3936d02bd6fe9bf674ab6059e6e45ab71c78404ab2c913224414", size = 43461, upload-time = "2025-12-06T13:26:28.906Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/4c/2a5258329200be57497d3972b5308558c6de42e3749c6cc2aa1cbe34b25a/pybase64-1.4.3-graalpy312-graalpy250_312_native-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b6b664758c804fa919b4f1257aa8cf68e95db76fc331de5f70bfc3a34655afe1", size = 36058, upload-time = "2025-12-06T13:26:30.092Z" }, ] [[package]] @@ -1846,7 +1820,7 @@ wheels = [ [[package]] name = "pydantic" -version = "2.12.4" +version = "2.12.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, @@ -1854,14 +1828,14 @@ dependencies = [ { name = "typing-extensions" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/96/ad/a17bc283d7d81837c061c49e3eaa27a45991759a1b7eae1031921c6bd924/pydantic-2.12.4.tar.gz", hash = "sha256:0f8cb9555000a4b5b617f66bfd2566264c4984b27589d3b845685983e8ea85ac", size = 821038, upload-time = "2025-11-05T10:50:08.59Z" } +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/82/2f/e68750da9b04856e2a7ec56fc6f034a5a79775e9b9a81882252789873798/pydantic-2.12.4-py3-none-any.whl", hash = "sha256:92d3d202a745d46f9be6df459ac5a064fdaa3c1c4cd8adcfa332ccf3c05f871e", size = 463400, upload-time = "2025-11-05T10:50:06.732Z" }, + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, ] [package.optional-dependencies] email = [ - { name = "email-validator" }, + { name = "email-validator", marker = "sys_platform == 'linux'" }, ] 
[[package]] @@ -1898,8 +1872,8 @@ name = "pydantic-extra-types" version = "2.10.6" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pydantic" }, - { name = "typing-extensions" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/3a/10/fb64987804cde41bcc39d9cd757cd5f2bb5d97b389d81aa70238b14b8a7e/pydantic_extra_types-2.10.6.tar.gz", hash = "sha256:c63d70bf684366e6bbe1f4ee3957952ebe6973d41e7802aea0b770d06b116aeb", size = 141858, upload-time = "2025-10-08T13:47:49.483Z" } wheels = [ @@ -1908,7 +1882,7 @@ wheels = [ [package.optional-dependencies] pycountry = [ - { name = "pycountry" }, + { name = "pycountry", marker = "sys_platform == 'linux'" }, ] [[package]] @@ -1916,9 +1890,9 @@ name = "pydantic-settings" version = "2.12.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pydantic" }, - { name = "python-dotenv" }, - { name = "typing-inspection" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "python-dotenv", marker = "sys_platform == 'linux'" }, + { name = "typing-inspection", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/43/4b/ac7e0aae12027748076d72a8764ff1c9d82ca75a7a52622e67ed3f765c54/pydantic_settings-2.12.0.tar.gz", hash = "sha256:005538ef951e3c2a68e1c08b292b5f2e71490def8589d4221b95dab00dafcfd0", size = 194184, upload-time = "2025-11-10T14:25:47.013Z" } wheels = [ @@ -1945,12 +1919,12 @@ wheels = [ [package.optional-dependencies] crypto = [ - { name = "cryptography" }, + { name = "cryptography", marker = "sys_platform == 'linux'" }, ] [[package]] name = "pytest" -version = "8.4.2" +version = "9.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -1959,22 +1933,22 @@ dependencies = [ { name = "pluggy" }, { 
name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] [[package]] name = "pytest-asyncio" -version = "1.2.0" +version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "pytest" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/42/86/9e3c5f48f7b7b638b216e4b9e645f54d199d7abbbab7a64a13b4e12ba10f/pytest_asyncio-1.2.0.tar.gz", hash = "sha256:c609a64a2a8768462d0c99811ddb8bd2583c33fd33cf7f21af1c142e824ffb57", size = 50119, upload-time = "2025-09-12T07:33:53.816Z" } +sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/04/93/2fa34714b7a4ae72f2f8dad66ba17dd9a2c793220719e736dda28b7aec27/pytest_asyncio-1.2.0-py3-none-any.whl", hash = "sha256:8e17ae5e46d8e7efe51ab6494dd2010f4ca8dae51652aa3c8d55acf50bfb2e99", size = 15095, upload-time = "2025-09-12T07:33:52.639Z" }, + { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, ] [[package]] @@ -2009,11 +1983,11 @@ wheels = [ [[package]] name = "python-multipart" -version = "0.0.20" +version = "0.0.21" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158, upload-time = "2024-12-16T19:45:46.972Z" } +sdist = { url = "https://files.pythonhosted.org/packages/78/96/804520d0850c7db98e5ccb70282e29208723f0964e88ffd9d0da2f52ea09/python_multipart-0.0.21.tar.gz", hash = "sha256:7137ebd4d3bbf70ea1622998f902b97a29434a9e8dc40eb203bbcf7c2a2cba92", size = 37196, upload-time = "2025-12-17T09:24:22.446Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload-time = "2024-12-16T19:45:44.423Z" }, + { url = "https://files.pythonhosted.org/packages/aa/76/03af049af4dcee5d27442f71b6924f01f3efb5d2bd34f23fcd563f2cc5f5/python_multipart-0.0.21-py3-none-any.whl", hash = "sha256:cf7a6713e01c87aa35387f4774e812c4361150938d20d232800f75ffcf266090", size = 24541, upload-time = "2025-12-17T09:24:21.153Z" }, ] [[package]] @@ -2058,55 
+2032,40 @@ name = "pyzmq" version = "27.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cffi", marker = "implementation_name == 'pypy'" }, + { name = "cffi", marker = "implementation_name == 'pypy' and sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/04/0b/3c9baedbdf613ecaa7aa07027780b8867f57b6293b6ee50de316c9f3222b/pyzmq-27.1.0.tar.gz", hash = "sha256:ac0765e3d44455adb6ddbf4417dcce460fc40a05978c08efdf2948072f6db540", size = 281750, upload-time = "2025-09-08T23:10:18.157Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/92/e7/038aab64a946d535901103da16b953c8c9cc9c961dadcbf3609ed6428d23/pyzmq-27.1.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:452631b640340c928fa343801b0d07eb0c3789a5ffa843f6e1a9cee0ba4eb4fc", size = 1306279, upload-time = "2025-09-08T23:08:03.807Z" }, { url = "https://files.pythonhosted.org/packages/e8/5e/c3c49fdd0f535ef45eefcc16934648e9e59dace4a37ee88fc53f6cd8e641/pyzmq-27.1.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1c179799b118e554b66da67d88ed66cd37a169f1f23b5d9f0a231b4e8d44a113", size = 895645, upload-time = "2025-09-08T23:08:05.301Z" }, { url = "https://files.pythonhosted.org/packages/f8/e5/b0b2504cb4e903a74dcf1ebae157f9e20ebb6ea76095f6cfffea28c42ecd/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3837439b7f99e60312f0c926a6ad437b067356dc2bc2ec96eb395fd0fe804233", size = 652574, upload-time = "2025-09-08T23:08:06.828Z" }, { url = "https://files.pythonhosted.org/packages/f8/9b/c108cdb55560eaf253f0cbdb61b29971e9fb34d9c3499b0e96e4e60ed8a5/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31", size = 840995, upload-time = "2025-09-08T23:08:08.396Z" }, { url = 
"https://files.pythonhosted.org/packages/c2/bb/b79798ca177b9eb0825b4c9998c6af8cd2a7f15a6a1a4272c1d1a21d382f/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0de3028d69d4cdc475bfe47a6128eb38d8bc0e8f4d69646adfbcd840facbac28", size = 1642070, upload-time = "2025-09-08T23:08:09.989Z" }, { url = "https://files.pythonhosted.org/packages/9c/80/2df2e7977c4ede24c79ae39dcef3899bfc5f34d1ca7a5b24f182c9b7a9ca/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:cf44a7763aea9298c0aa7dbf859f87ed7012de8bda0f3977b6fb1d96745df856", size = 2021121, upload-time = "2025-09-08T23:08:11.907Z" }, { url = "https://files.pythonhosted.org/packages/46/bd/2d45ad24f5f5ae7e8d01525eb76786fa7557136555cac7d929880519e33a/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496", size = 1878550, upload-time = "2025-09-08T23:08:13.513Z" }, - { url = "https://files.pythonhosted.org/packages/e6/2f/104c0a3c778d7c2ab8190e9db4f62f0b6957b53c9d87db77c284b69f33ea/pyzmq-27.1.0-cp312-abi3-win32.whl", hash = "sha256:250e5436a4ba13885494412b3da5d518cd0d3a278a1ae640e113c073a5f88edd", size = 559184, upload-time = "2025-09-08T23:08:15.163Z" }, - { url = "https://files.pythonhosted.org/packages/fc/7f/a21b20d577e4100c6a41795842028235998a643b1ad406a6d4163ea8f53e/pyzmq-27.1.0-cp312-abi3-win_amd64.whl", hash = "sha256:9ce490cf1d2ca2ad84733aa1d69ce6855372cb5ce9223802450c9b2a7cba0ccf", size = 619480, upload-time = "2025-09-08T23:08:17.192Z" }, - { url = "https://files.pythonhosted.org/packages/78/c2/c012beae5f76b72f007a9e91ee9401cb88c51d0f83c6257a03e785c81cc2/pyzmq-27.1.0-cp312-abi3-win_arm64.whl", hash = "sha256:75a2f36223f0d535a0c919e23615fc85a1e23b71f40c7eb43d7b1dedb4d8f15f", size = 552993, upload-time = "2025-09-08T23:08:18.926Z" }, ] [[package]] name = "ray" -version = "2.51.1" +version = "2.53.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click" }, - { name = "filelock" }, - { 
name = "jsonschema" }, - { name = "msgpack" }, - { name = "packaging" }, - { name = "protobuf" }, - { name = "pyyaml" }, - { name = "requests" }, + { name = "click", marker = "sys_platform == 'linux'" }, + { name = "filelock", marker = "sys_platform == 'linux'" }, + { name = "jsonschema", marker = "sys_platform == 'linux'" }, + { name = "msgpack", marker = "sys_platform == 'linux'" }, + { name = "packaging", marker = "sys_platform == 'linux'" }, + { name = "protobuf", marker = "sys_platform == 'linux'" }, + { name = "pyyaml", marker = "sys_platform == 'linux'" }, + { name = "requests", marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/f6/95/51e44ce79e42f02ca1c4d4c5501e6dd49f3a384c5f6324aceb4e0015988a/ray-2.51.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ef847b025ca758baee4571a1ca001d973897cad772f8e95d7f303d24c38b649e", size = 68029226, upload-time = "2025-11-01T03:24:21.928Z" }, - { url = "https://files.pythonhosted.org/packages/e2/b5/a93e39e131067edb7cba3385a609f61aaaf7aa54728cd3a7474bfbf3b0fc/ray-2.51.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:0bed9408712bad1511e65683a455302f88d94e5e5cb6a58cc4a154b61d8a0b4a", size = 70502423, upload-time = "2025-11-01T03:24:27.398Z" }, - { url = "https://files.pythonhosted.org/packages/ee/59/69b7a653ed8176fc7fd894d462ed34bb1477e7fa71700324de99179b5b7e/ray-2.51.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:4e786da7862cf73664977d0212a505d6d5a585beadf63e7dc1e1c129259bee20", size = 71353730, upload-time = "2025-11-01T03:24:33.495Z" }, - { url = "https://files.pythonhosted.org/packages/38/91/0c4fe7aed34baa14d9c050c88f39ff16083d555bd6dcd6c4ffb4332a6f8a/ray-2.51.1-cp312-cp312-win_amd64.whl", hash = "sha256:198fda93074a6863555f4003e9013bb2ba0cd50b59b18c02affdc294b28a2eef", size = 26674921, upload-time = "2025-11-01T03:24:38.394Z" }, + { url = 
"https://files.pythonhosted.org/packages/38/68/8e59b8413f3751fe7ce8b98ee8787d13964b47a4043587950790a9dd2151/ray-2.53.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:65e2ce58d3dc6baa3cf45824d889c1968ebde565ee54dfd80a98af8f31af8e4a", size = 71504450, upload-time = "2025-12-20T16:06:34.922Z" }, + { url = "https://files.pythonhosted.org/packages/2a/db/978a50d264565ca42e2a4bf115ec9a1f04f19ca5e620e6aa2f280747b644/ray-2.53.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:14f46363e9b4cf0c1c8b4d8623ec337c5bd408377831b5e5b50067930137bbca", size = 72370424, upload-time = "2025-12-20T16:06:40.821Z" }, ] [package.optional-dependencies] cgraph = [ - { name = "cupy-cuda12x", marker = "sys_platform != 'darwin'" }, -] - -[[package]] -name = "redis" -version = "7.1.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/c8/983d5c6579a411d8a99bc5823cc5712768859b5ce2c8afe1a65b37832c81/redis-7.1.0.tar.gz", hash = "sha256:b1cc3cfa5a2cb9c2ab3ba700864fb0ad75617b41f01352ce5779dabf6d5f9c3c", size = 4796669, upload-time = "2025-11-19T15:54:39.961Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/89/f0/8956f8a86b20d7bb9d6ac0187cf4cd54d8065bc9a1a09eb8011d4d326596/redis-7.1.0-py3-none-any.whl", hash = "sha256:23c52b208f92b56103e17c5d06bdc1a6c2c0b3106583985a76a18f83b265de2b", size = 354159, upload-time = "2025-11-19T15:54:38.064Z" }, + { name = "cupy-cuda12x", marker = "sys_platform == 'linux'" }, ] [[package]] @@ -2114,9 +2073,9 @@ name = "referencing" version = "0.37.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "attrs" }, - { name = "rpds-py" }, - { name = "typing-extensions" }, + { name = "attrs", marker = "sys_platform == 'linux'" }, + { name = "rpds-py", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } wheels = [ @@ -2175,16 +2134,16 @@ wheels = [ [[package]] name = "rich-toolkit" -version = "0.15.1" +version = "0.17.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click" }, - { name = "rich" }, - { name = "typing-extensions" }, + { name = "click", marker = "sys_platform == 'linux'" }, + { name = "rich", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/67/33/1a18839aaa8feef7983590c05c22c9c09d245ada6017d118325bbfcc7651/rich_toolkit-0.15.1.tar.gz", hash = "sha256:6f9630eb29f3843d19d48c3bd5706a086d36d62016687f9d0efa027ddc2dd08a", size = 115322, upload-time = "2025-09-04T09:28:11.789Z" } +sdist = { url = "https://files.pythonhosted.org/packages/97/09/3f9b8d9daaf235195c626f21e03604c05b987404ee3bcacee0c1f67f2a8e/rich_toolkit-0.17.1.tar.gz", hash = "sha256:5af54df8d1dd9c8530e462e1bdcaed625c9b49f5a55b035aa0ba1c17bdb87c9a", size = 187925, upload-time = "2025-12-17T10:49:22.583Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/49/42821d55ead7b5a87c8d121edf323cb393d8579f63e933002ade900b784f/rich_toolkit-0.15.1-py3-none-any.whl", hash = "sha256:36a0b1d9a135d26776e4b78f1d5c2655da6e0ef432380b5c6b523c8d8ab97478", size = 29412, upload-time = "2025-09-04T09:28:10.587Z" }, + { url = "https://files.pythonhosted.org/packages/7f/7b/15e55fa8a76d0d41bf34d965af78acdaf80a315907adb30de8b63c272694/rich_toolkit-0.17.1-py3-none-any.whl", hash = "sha256:96d24bb921ecd225ffce7c526a9149e74006410c05e6d405bd74ffd54d5631ed", size = 31412, upload-time = "2025-12-17T10:49:21.793Z" }, ] [[package]] @@ -2193,8 +2152,6 @@ version = "0.7.6" source = { registry = 
"https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/e5/f5/8bed2310abe4ae04b67a38374a4d311dd85220f5d8da56f47ae9361be0b0/rignore-0.7.6.tar.gz", hash = "sha256:00d3546cd793c30cb17921ce674d2c8f3a4b00501cb0e3dd0e82217dbeba2671", size = 57140, upload-time = "2025-11-05T21:41:21.968Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/0e/012556ef3047a2628842b44e753bb15f4dc46806780ff090f1e8fe4bf1eb/rignore-0.7.6-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:03e82348cb7234f8d9b2834f854400ddbbd04c0f8f35495119e66adbd37827a8", size = 883488, upload-time = "2025-11-05T20:42:41.359Z" }, - { url = "https://files.pythonhosted.org/packages/93/b0/d4f1f3fe9eb3f8e382d45ce5b0547ea01c4b7e0b4b4eb87bcd66a1d2b888/rignore-0.7.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9e624f6be6116ea682e76c5feb71ea91255c67c86cb75befe774365b2931961", size = 820411, upload-time = "2025-11-05T20:42:24.782Z" }, { url = "https://files.pythonhosted.org/packages/4a/c8/dea564b36dedac8de21c18e1851789545bc52a0c22ece9843444d5608a6a/rignore-0.7.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bda49950d405aa8d0ebe26af807c4e662dd281d926530f03f29690a2e07d649a", size = 897821, upload-time = "2025-11-05T20:40:52.613Z" }, { url = "https://files.pythonhosted.org/packages/b3/2b/ee96db17ac1835e024c5d0742eefb7e46de60020385ac883dd3d1cde2c1f/rignore-0.7.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5fd5ab3840b8c16851d327ed06e9b8be6459702a53e5ab1fc4073b684b3789e", size = 873963, upload-time = "2025-11-05T20:41:07.49Z" }, { url = "https://files.pythonhosted.org/packages/a5/8c/ad5a57bbb9d14d5c7e5960f712a8a0b902472ea3f4a2138cbf70d1777b75/rignore-0.7.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ced2a248352636a5c77504cb755dc02c2eef9a820a44d3f33061ce1bb8a7f2d2", size = 1169216, upload-time = "2025-11-05T20:41:23.73Z" }, @@ -2205,80 +2162,72 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/d4/cf/2c64f0b6725149f7c6e7e5a909d14354889b4beaadddaa5fff023ec71084/rignore-0.7.6-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:5719ea14ea2b652c0c0894be5dfde954e1853a80dea27dd2fbaa749618d837f5", size = 1139186, upload-time = "2025-11-05T21:40:31.27Z" }, { url = "https://files.pythonhosted.org/packages/75/95/a86c84909ccc24af0d094b50d54697951e576c252a4d9f21b47b52af9598/rignore-0.7.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:8e23424fc7ce35726854f639cb7968151a792c0c3d9d082f7f67e0c362cfecca", size = 1117604, upload-time = "2025-11-05T21:40:48.07Z" }, { url = "https://files.pythonhosted.org/packages/7f/5e/13b249613fd5d18d58662490ab910a9f0be758981d1797789913adb4e918/rignore-0.7.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3efdcf1dd84d45f3e2bd2f93303d9be103888f56dfa7c3349b5bf4f0657ec696", size = 1127725, upload-time = "2025-11-05T21:41:05.804Z" }, - { url = "https://files.pythonhosted.org/packages/c7/28/fa5dcd1e2e16982c359128664e3785f202d3eca9b22dd0b2f91c4b3d242f/rignore-0.7.6-cp312-cp312-win32.whl", hash = "sha256:ccca9d1a8b5234c76b71546fc3c134533b013f40495f394a65614a81f7387046", size = 646145, upload-time = "2025-11-05T21:41:51.096Z" }, - { url = "https://files.pythonhosted.org/packages/26/87/69387fb5dd81a0f771936381431780b8cf66fcd2cfe9495e1aaf41548931/rignore-0.7.6-cp312-cp312-win_amd64.whl", hash = "sha256:c96a285e4a8bfec0652e0bfcf42b1aabcdda1e7625f5006d188e3b1c87fdb543", size = 726090, upload-time = "2025-11-05T21:41:36.485Z" }, - { url = "https://files.pythonhosted.org/packages/24/5f/e8418108dcda8087fb198a6f81caadbcda9fd115d61154bf0df4d6d3619b/rignore-0.7.6-cp312-cp312-win_arm64.whl", hash = "sha256:a64a750e7a8277a323f01ca50b7784a764845f6cce2fe38831cb93f0508d0051", size = 656317, upload-time = "2025-11-05T21:41:25.305Z" }, ] [[package]] name = "rpds-py" -version = "0.28.0" +version = "0.30.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/48/dc/95f074d43452b3ef5d06276696ece4b3b5d696e7c9ad7173c54b1390cd70/rpds_py-0.28.0.tar.gz", hash = "sha256:abd4df20485a0983e2ca334a216249b6186d6e3c1627e106651943dbdb791aea", size = 27419, upload-time = "2025-10-22T22:24:29.327Z" } +sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b8/5c/6c3936495003875fe7b14f90ea812841a08fca50ab26bd840e924097d9c8/rpds_py-0.28.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6b4f28583a4f247ff60cd7bdda83db8c3f5b05a7a82ff20dd4b078571747708f", size = 366439, upload-time = "2025-10-22T22:22:04.525Z" }, - { url = "https://files.pythonhosted.org/packages/56/f9/a0f1ca194c50aa29895b442771f036a25b6c41a35e4f35b1a0ea713bedae/rpds_py-0.28.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d678e91b610c29c4b3d52a2c148b641df2b4676ffe47c59f6388d58b99cdc424", size = 348170, upload-time = "2025-10-22T22:22:06.397Z" }, - { url = "https://files.pythonhosted.org/packages/18/ea/42d243d3a586beb72c77fa5def0487daf827210069a95f36328e869599ea/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e819e0e37a44a78e1383bf1970076e2ccc4dc8c2bbaa2f9bd1dc987e9afff628", size = 378838, upload-time = "2025-10-22T22:22:07.932Z" }, - { url = "https://files.pythonhosted.org/packages/e7/78/3de32e18a94791af8f33601402d9d4f39613136398658412a4e0b3047327/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5ee514e0f0523db5d3fb171f397c54875dbbd69760a414dccf9d4d7ad628b5bd", size = 393299, upload-time = "2025-10-22T22:22:09.435Z" }, - { url = 
"https://files.pythonhosted.org/packages/13/7e/4bdb435afb18acea2eb8a25ad56b956f28de7c59f8a1d32827effa0d4514/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3fa06d27fdcee47f07a39e02862da0100cb4982508f5ead53ec533cd5fe55e", size = 518000, upload-time = "2025-10-22T22:22:11.326Z" }, - { url = "https://files.pythonhosted.org/packages/31/d0/5f52a656875cdc60498ab035a7a0ac8f399890cc1ee73ebd567bac4e39ae/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46959ef2e64f9e4a41fc89aa20dbca2b85531f9a72c21099a3360f35d10b0d5a", size = 408746, upload-time = "2025-10-22T22:22:13.143Z" }, - { url = "https://files.pythonhosted.org/packages/3e/cd/49ce51767b879cde77e7ad9fae164ea15dce3616fe591d9ea1df51152706/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8455933b4bcd6e83fde3fefc987a023389c4b13f9a58c8d23e4b3f6d13f78c84", size = 386379, upload-time = "2025-10-22T22:22:14.602Z" }, - { url = "https://files.pythonhosted.org/packages/6a/99/e4e1e1ee93a98f72fc450e36c0e4d99c35370220e815288e3ecd2ec36a2a/rpds_py-0.28.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ad50614a02c8c2962feebe6012b52f9802deec4263946cddea37aaf28dd25a66", size = 401280, upload-time = "2025-10-22T22:22:16.063Z" }, - { url = "https://files.pythonhosted.org/packages/61/35/e0c6a57488392a8b319d2200d03dad2b29c0db9996f5662c3b02d0b86c02/rpds_py-0.28.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e5deca01b271492553fdb6c7fd974659dce736a15bae5dad7ab8b93555bceb28", size = 412365, upload-time = "2025-10-22T22:22:17.504Z" }, - { url = "https://files.pythonhosted.org/packages/ff/6a/841337980ea253ec797eb084665436007a1aad0faac1ba097fb906c5f69c/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:735f8495a13159ce6a0d533f01e8674cec0c57038c920495f87dcb20b3ddb48a", size = 559573, upload-time = "2025-10-22T22:22:19.108Z" }, - { url = 
"https://files.pythonhosted.org/packages/e7/5e/64826ec58afd4c489731f8b00729c5f6afdb86f1df1df60bfede55d650bb/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:961ca621ff10d198bbe6ba4957decca61aa2a0c56695384c1d6b79bf61436df5", size = 583973, upload-time = "2025-10-22T22:22:20.768Z" }, - { url = "https://files.pythonhosted.org/packages/b6/ee/44d024b4843f8386a4eeaa4c171b3d31d55f7177c415545fd1a24c249b5d/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2374e16cc9131022e7d9a8f8d65d261d9ba55048c78f3b6e017971a4f5e6353c", size = 553800, upload-time = "2025-10-22T22:22:22.25Z" }, - { url = "https://files.pythonhosted.org/packages/7d/89/33e675dccff11a06d4d85dbb4d1865f878d5020cbb69b2c1e7b2d3f82562/rpds_py-0.28.0-cp312-cp312-win32.whl", hash = "sha256:d15431e334fba488b081d47f30f091e5d03c18527c325386091f31718952fe08", size = 216954, upload-time = "2025-10-22T22:22:24.105Z" }, - { url = "https://files.pythonhosted.org/packages/af/36/45f6ebb3210887e8ee6dbf1bc710ae8400bb417ce165aaf3024b8360d999/rpds_py-0.28.0-cp312-cp312-win_amd64.whl", hash = "sha256:a410542d61fc54710f750d3764380b53bf09e8c4edbf2f9141a82aa774a04f7c", size = 227844, upload-time = "2025-10-22T22:22:25.551Z" }, - { url = "https://files.pythonhosted.org/packages/57/91/f3fb250d7e73de71080f9a221d19bd6a1c1eb0d12a1ea26513f6c1052ad6/rpds_py-0.28.0-cp312-cp312-win_arm64.whl", hash = "sha256:1f0cfd1c69e2d14f8c892b893997fa9a60d890a0c8a603e88dca4955f26d1edd", size = 217624, upload-time = "2025-10-22T22:22:26.914Z" }, + { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, + { url = "https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, + { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, + { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, + { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, + { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, + { url = "https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, ] [[package]] name = "ruff" -version = "0.14.5" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/82/fa/fbb67a5780ae0f704876cb8ac92d6d76da41da4dc72b7ed3565ab18f2f52/ruff-0.14.5.tar.gz", hash = "sha256:8d3b48d7d8aad423d3137af7ab6c8b1e38e4de104800f0d596990f6ada1a9fc1", size = 5615944, upload-time = "2025-11-13T19:58:51.155Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/68/31/c07e9c535248d10836a94e4f4e8c5a31a1beed6f169b31405b227872d4f4/ruff-0.14.5-py3-none-linux_armv6l.whl", hash = "sha256:f3b8248123b586de44a8018bcc9fefe31d23dda57a34e6f0e1e53bd51fd63594", size = 13171630, upload-time = "2025-11-13T19:57:54.894Z" }, - { url = "https://files.pythonhosted.org/packages/8e/5c/283c62516dca697cd604c2796d1487396b7a436b2f0ecc3fd412aca470e0/ruff-0.14.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:f7a75236570318c7a30edd7f5491945f0169de738d945ca8784500b517163a72", size = 13413925, upload-time = "2025-11-13T19:57:59.181Z" }, - { url = 
"https://files.pythonhosted.org/packages/b6/f3/aa319f4afc22cb6fcba2b9cdfc0f03bbf747e59ab7a8c5e90173857a1361/ruff-0.14.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6d146132d1ee115f8802356a2dc9a634dbf58184c51bff21f313e8cd1c74899a", size = 12574040, upload-time = "2025-11-13T19:58:02.056Z" }, - { url = "https://files.pythonhosted.org/packages/f9/7f/cb5845fcc7c7e88ed57f58670189fc2ff517fe2134c3821e77e29fd3b0c8/ruff-0.14.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2380596653dcd20b057794d55681571a257a42327da8894b93bbd6111aa801f", size = 13009755, upload-time = "2025-11-13T19:58:05.172Z" }, - { url = "https://files.pythonhosted.org/packages/21/d2/bcbedbb6bcb9253085981730687ddc0cc7b2e18e8dc13cf4453de905d7a0/ruff-0.14.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2d1fa985a42b1f075a098fa1ab9d472b712bdb17ad87a8ec86e45e7fa6273e68", size = 12937641, upload-time = "2025-11-13T19:58:08.345Z" }, - { url = "https://files.pythonhosted.org/packages/a4/58/e25de28a572bdd60ffc6bb71fc7fd25a94ec6a076942e372437649cbb02a/ruff-0.14.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88f0770d42b7fa02bbefddde15d235ca3aa24e2f0137388cc15b2dcbb1f7c7a7", size = 13610854, upload-time = "2025-11-13T19:58:11.419Z" }, - { url = "https://files.pythonhosted.org/packages/7d/24/43bb3fd23ecee9861970978ea1a7a63e12a204d319248a7e8af539984280/ruff-0.14.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:3676cb02b9061fee7294661071c4709fa21419ea9176087cb77e64410926eb78", size = 15061088, upload-time = "2025-11-13T19:58:14.551Z" }, - { url = "https://files.pythonhosted.org/packages/23/44/a022f288d61c2f8c8645b24c364b719aee293ffc7d633a2ca4d116b9c716/ruff-0.14.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b595bedf6bc9cab647c4a173a61acf4f1ac5f2b545203ba82f30fcb10b0318fb", size = 14734717, upload-time = "2025-11-13T19:58:17.518Z" }, - { url = 
"https://files.pythonhosted.org/packages/58/81/5c6ba44de7e44c91f68073e0658109d8373b0590940efe5bd7753a2585a3/ruff-0.14.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f55382725ad0bdb2e8ee2babcbbfb16f124f5a59496a2f6a46f1d9d99d93e6e2", size = 14028812, upload-time = "2025-11-13T19:58:20.533Z" }, - { url = "https://files.pythonhosted.org/packages/ad/ef/41a8b60f8462cb320f68615b00299ebb12660097c952c600c762078420f8/ruff-0.14.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7497d19dce23976bdaca24345ae131a1d38dcfe1b0850ad8e9e6e4fa321a6e19", size = 13825656, upload-time = "2025-11-13T19:58:23.345Z" }, - { url = "https://files.pythonhosted.org/packages/7c/00/207e5de737fdb59b39eb1fac806904fe05681981b46d6a6db9468501062e/ruff-0.14.5-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:410e781f1122d6be4f446981dd479470af86537fb0b8857f27a6e872f65a38e4", size = 13959922, upload-time = "2025-11-13T19:58:26.537Z" }, - { url = "https://files.pythonhosted.org/packages/bc/7e/fa1f5c2776db4be405040293618846a2dece5c70b050874c2d1f10f24776/ruff-0.14.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c01be527ef4c91a6d55e53b337bfe2c0f82af024cc1a33c44792d6844e2331e1", size = 12932501, upload-time = "2025-11-13T19:58:29.822Z" }, - { url = "https://files.pythonhosted.org/packages/67/d8/d86bf784d693a764b59479a6bbdc9515ae42c340a5dc5ab1dabef847bfaa/ruff-0.14.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:f66e9bb762e68d66e48550b59c74314168ebb46199886c5c5aa0b0fbcc81b151", size = 12927319, upload-time = "2025-11-13T19:58:32.923Z" }, - { url = "https://files.pythonhosted.org/packages/ac/de/ee0b304d450ae007ce0cb3e455fe24fbcaaedae4ebaad6c23831c6663651/ruff-0.14.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d93be8f1fa01022337f1f8f3bcaa7ffee2d0b03f00922c45c2207954f351f465", size = 13206209, upload-time = "2025-11-13T19:58:35.952Z" }, - { url = 
"https://files.pythonhosted.org/packages/33/aa/193ca7e3a92d74f17d9d5771a765965d2cf42c86e6f0fd95b13969115723/ruff-0.14.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c135d4b681f7401fe0e7312017e41aba9b3160861105726b76cfa14bc25aa367", size = 13953709, upload-time = "2025-11-13T19:58:39.002Z" }, - { url = "https://files.pythonhosted.org/packages/cc/f1/7119e42aa1d3bf036ffc9478885c2e248812b7de9abea4eae89163d2929d/ruff-0.14.5-py3-none-win32.whl", hash = "sha256:c83642e6fccfb6dea8b785eb9f456800dcd6a63f362238af5fc0c83d027dd08b", size = 12925808, upload-time = "2025-11-13T19:58:42.779Z" }, - { url = "https://files.pythonhosted.org/packages/3b/9d/7c0a255d21e0912114784e4a96bf62af0618e2190cae468cd82b13625ad2/ruff-0.14.5-py3-none-win_amd64.whl", hash = "sha256:9d55d7af7166f143c94eae1db3312f9ea8f95a4defef1979ed516dbb38c27621", size = 14331546, upload-time = "2025-11-13T19:58:45.691Z" }, - { url = "https://files.pythonhosted.org/packages/e5/80/69756670caedcf3b9be597a6e12276a6cf6197076eb62aad0c608f8efce0/ruff-0.14.5-py3-none-win_arm64.whl", hash = "sha256:4b700459d4649e2594b31f20a9de33bc7c19976d4746d8d0798ad959621d64a4", size = 13433331, upload-time = "2025-11-13T19:58:48.434Z" }, +version = "0.14.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/57/08/52232a877978dd8f9cf2aeddce3e611b40a63287dfca29b6b8da791f5e8d/ruff-0.14.10.tar.gz", hash = "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4", size = 5859763, upload-time = "2025-12-18T19:28:57.98Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/01/933704d69f3f05ee16ef11406b78881733c186fe14b6a46b05cfcaf6d3b2/ruff-0.14.10-py3-none-linux_armv6l.whl", hash = "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49", size = 13527080, upload-time = "2025-12-18T19:29:25.642Z" }, + { url = 
"https://files.pythonhosted.org/packages/df/58/a0349197a7dfa603ffb7f5b0470391efa79ddc327c1e29c4851e85b09cc5/ruff-0.14.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f", size = 13797320, upload-time = "2025-12-18T19:29:02.571Z" }, + { url = "https://files.pythonhosted.org/packages/7b/82/36be59f00a6082e38c23536df4e71cdbc6af8d7c707eade97fcad5c98235/ruff-0.14.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d", size = 12918434, upload-time = "2025-12-18T19:28:51.202Z" }, + { url = "https://files.pythonhosted.org/packages/a6/00/45c62a7f7e34da92a25804f813ebe05c88aa9e0c25e5cb5a7d23dd7450e3/ruff-0.14.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77", size = 13371961, upload-time = "2025-12-18T19:29:04.991Z" }, + { url = "https://files.pythonhosted.org/packages/40/31/a5906d60f0405f7e57045a70f2d57084a93ca7425f22e1d66904769d1628/ruff-0.14.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a", size = 13275629, upload-time = "2025-12-18T19:29:21.381Z" }, + { url = "https://files.pythonhosted.org/packages/3e/60/61c0087df21894cf9d928dc04bcd4fb10e8b2e8dca7b1a276ba2155b2002/ruff-0.14.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f", size = 14029234, upload-time = "2025-12-18T19:29:00.132Z" }, + { url = "https://files.pythonhosted.org/packages/44/84/77d911bee3b92348b6e5dab5a0c898d87084ea03ac5dc708f46d88407def/ruff-0.14.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935", size = 15449890, upload-time = "2025-12-18T19:28:53.573Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/36/480206eaefa24a7ec321582dda580443a8f0671fdbf6b1c80e9c3e93a16a/ruff-0.14.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e", size = 15123172, upload-time = "2025-12-18T19:29:23.453Z" }, + { url = "https://files.pythonhosted.org/packages/5c/38/68e414156015ba80cef5473d57919d27dfb62ec804b96180bafdeaf0e090/ruff-0.14.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d", size = 14460260, upload-time = "2025-12-18T19:29:27.808Z" }, + { url = "https://files.pythonhosted.org/packages/b3/19/9e050c0dca8aba824d67cc0db69fb459c28d8cd3f6855b1405b3f29cc91d/ruff-0.14.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f", size = 14229978, upload-time = "2025-12-18T19:29:11.32Z" }, + { url = "https://files.pythonhosted.org/packages/51/eb/e8dd1dd6e05b9e695aa9dd420f4577debdd0f87a5ff2fedda33c09e9be8c/ruff-0.14.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f", size = 14338036, upload-time = "2025-12-18T19:29:09.184Z" }, + { url = "https://files.pythonhosted.org/packages/6a/12/f3e3a505db7c19303b70af370d137795fcfec136d670d5de5391e295c134/ruff-0.14.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d", size = 13264051, upload-time = "2025-12-18T19:29:13.431Z" }, + { url = "https://files.pythonhosted.org/packages/08/64/8c3a47eaccfef8ac20e0484e68e0772013eb85802f8a9f7603ca751eb166/ruff-0.14.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405", size = 13283998, upload-time = "2025-12-18T19:29:06.994Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/84/534a5506f4074e5cc0529e5cd96cfc01bb480e460c7edf5af70d2bcae55e/ruff-0.14.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60", size = 13601891, upload-time = "2025-12-18T19:28:55.811Z" }, + { url = "https://files.pythonhosted.org/packages/0d/1e/14c916087d8598917dbad9b2921d340f7884824ad6e9c55de948a93b106d/ruff-0.14.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830", size = 14336660, upload-time = "2025-12-18T19:29:16.531Z" }, + { url = "https://files.pythonhosted.org/packages/f2/1c/d7b67ab43f30013b47c12b42d1acd354c195351a3f7a1d67f59e54227ede/ruff-0.14.10-py3-none-win32.whl", hash = "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6", size = 13196187, upload-time = "2025-12-18T19:29:19.006Z" }, + { url = "https://files.pythonhosted.org/packages/fb/9c/896c862e13886fae2af961bef3e6312db9ebc6adc2b156fe95e615dee8c1/ruff-0.14.10-py3-none-win_amd64.whl", hash = "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154", size = 14661283, upload-time = "2025-12-18T19:29:30.16Z" }, + { url = "https://files.pythonhosted.org/packages/74/31/b0e29d572670dca3674eeee78e418f20bdf97fa8aa9ea71380885e175ca0/ruff-0.14.10-py3-none-win_arm64.whl", hash = "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6", size = 13729839, upload-time = "2025-12-18T19:28:48.636Z" }, ] [[package]] name = "safetensors" -version = "0.6.2" +version = "0.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" }, - { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" }, - { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" }, - { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" }, - { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" }, - { url = 
"https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" }, - { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" }, - { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" }, - { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" }, - { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" }, - { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" }, - { url = 
"https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" }, - { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" }, - { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" }, + { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" }, + { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" }, + { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" }, + { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" }, + { url = "https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" }, + { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" }, + { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" }, + { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" }, + { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" }, + { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" }, + { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" }, ] [[package]] @@ -2286,20 +2235,14 @@ name = "scipy" version = "1.16.3" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = 
"sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/40/41/5bf55c3f386b1643812f3a5674edf74b26184378ef0f3e7c7a09a7e2ca7f/scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6", size = 36659043, upload-time = "2025-10-28T17:32:40.285Z" }, - { url = "https://files.pythonhosted.org/packages/1e/0f/65582071948cfc45d43e9870bf7ca5f0e0684e165d7c9ef4e50d783073eb/scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07", size = 28898986, upload-time = "2025-10-28T17:32:45.325Z" }, - { url = "https://files.pythonhosted.org/packages/96/5e/36bf3f0ac298187d1ceadde9051177d6a4fe4d507e8f59067dc9dd39e650/scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9", size = 20889814, upload-time = "2025-10-28T17:32:49.277Z" }, - { url = "https://files.pythonhosted.org/packages/80/35/178d9d0c35394d5d5211bbff7ac4f2986c5488b59506fef9e1de13ea28d3/scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686", size = 23565795, upload-time = "2025-10-28T17:32:53.337Z" }, { url = "https://files.pythonhosted.org/packages/fa/46/d1146ff536d034d02f83c8afc3c4bab2eddb634624d6529a8512f3afc9da/scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203", size = 33349476, upload-time = "2025-10-28T17:32:58.353Z" }, { url = "https://files.pythonhosted.org/packages/79/2e/415119c9ab3e62249e18c2b082c07aff907a273741b3f8160414b0e9193c/scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1", 
size = 35676692, upload-time = "2025-10-28T17:33:03.88Z" }, { url = "https://files.pythonhosted.org/packages/27/82/df26e44da78bf8d2aeaf7566082260cfa15955a5a6e96e6a29935b64132f/scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe", size = 36019345, upload-time = "2025-10-28T17:33:09.773Z" }, { url = "https://files.pythonhosted.org/packages/82/31/006cbb4b648ba379a95c87262c2855cd0d09453e500937f78b30f02fa1cd/scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70", size = 38678975, upload-time = "2025-10-28T17:33:15.809Z" }, - { url = "https://files.pythonhosted.org/packages/c2/7f/acbd28c97e990b421af7d6d6cd416358c9c293fc958b8529e0bd5d2a2a19/scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc", size = 38555926, upload-time = "2025-10-28T17:33:21.388Z" }, - { url = "https://files.pythonhosted.org/packages/ce/69/c5c7807fd007dad4f48e0a5f2153038dc96e8725d3345b9ee31b2b7bed46/scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2", size = 25463014, upload-time = "2025-10-28T17:33:25.975Z" }, ] [[package]] @@ -2308,27 +2251,21 @@ version = "0.2.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/15/15/2e7a025fc62d764b151ae6d0f2a92f8081755ebe8d4a64099accc6f77ba6/sentencepiece-0.2.1.tar.gz", hash = "sha256:8138cec27c2f2282f4a34d9a016e3374cd40e5c6e9cb335063db66a0a3b71fad", size = 3228515, upload-time = "2025-08-12T07:00:51.718Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4a/be/32ce495aa1d0e0c323dcb1ba87096037358edee539cac5baf8755a6bd396/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:57cae326c8727de58c85977b175af132a7138d84c764635d7e71bbee7e774133", size = 1943152, upload-time = 
"2025-08-12T06:59:40.048Z" }, - { url = "https://files.pythonhosted.org/packages/88/7e/ff23008899a58678e98c6ff592bf4d368eee5a71af96d0df6b38a039dd4f/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:56dd39a3c4d6493db3cdca7e8cc68c6b633f0d4195495cbadfcf5af8a22d05a6", size = 1325651, upload-time = "2025-08-12T06:59:41.536Z" }, - { url = "https://files.pythonhosted.org/packages/19/84/42eb3ce4796777a1b5d3699dfd4dca85113e68b637f194a6c8d786f16a04/sentencepiece-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d9381351182ff9888cc80e41c632e7e274b106f450de33d67a9e8f6043da6f76", size = 1253645, upload-time = "2025-08-12T06:59:42.903Z" }, { url = "https://files.pythonhosted.org/packages/89/fa/d3d5ebcba3cb9e6d3775a096251860c41a6bc53a1b9461151df83fe93255/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99f955df238021bf11f0fc37cdb54fd5e5b5f7fd30ecc3d93fb48b6815437167", size = 1316273, upload-time = "2025-08-12T06:59:44.476Z" }, { url = "https://files.pythonhosted.org/packages/04/88/14f2f4a2b922d8b39be45bf63d79e6cd3a9b2f248b2fcb98a69b12af12f5/sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0cdfecef430d985f1c2bcbfff3defd1d95dae876fbd0173376012d2d7d24044b", size = 1387881, upload-time = "2025-08-12T06:59:46.09Z" }, - { url = "https://files.pythonhosted.org/packages/fd/b8/903e5ccb77b4ef140605d5d71b4f9e0ad95d456d6184688073ed11712809/sentencepiece-0.2.1-cp312-cp312-win32.whl", hash = "sha256:a483fd29a34c3e34c39ac5556b0a90942bec253d260235729e50976f5dba1068", size = 999540, upload-time = "2025-08-12T06:59:48.023Z" }, - { url = "https://files.pythonhosted.org/packages/2d/81/92df5673c067148c2545b1bfe49adfd775bcc3a169a047f5a0e6575ddaca/sentencepiece-0.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4cdc7c36234fda305e85c32949c5211faaf8dd886096c7cea289ddc12a2d02de", size = 1054671, upload-time = "2025-08-12T06:59:49.895Z" }, - { url = 
"https://files.pythonhosted.org/packages/fe/02/c5e3bc518655d714622bec87d83db9cdba1cd0619a4a04e2109751c4f47f/sentencepiece-0.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:daeb5e9e9fcad012324807856113708614d534f596d5008638eb9b40112cd9e4", size = 1033923, upload-time = "2025-08-12T06:59:51.952Z" }, ] [[package]] name = "sentry-sdk" -version = "2.43.0" +version = "2.48.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b3/18/09875b4323b03ca9025bae7e6539797b27e4fc032998a466b4b9c3d24653/sentry_sdk-2.43.0.tar.gz", hash = "sha256:52ed6e251c5d2c084224d73efee56b007ef5c2d408a4a071270e82131d336e20", size = 368953, upload-time = "2025-10-29T11:26:08.156Z" } +sdist = { url = "https://files.pythonhosted.org/packages/40/f0/0e9dc590513d5e742d7799e2038df3a05167cba084c6ca4f3cdd75b55164/sentry_sdk-2.48.0.tar.gz", hash = "sha256:5213190977ff7fdff8a58b722fb807f8d5524a80488626ebeda1b5676c0c1473", size = 384828, upload-time = "2025-12-16T14:55:41.722Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/69/31/8228fa962f7fd8814d634e4ebece8780e2cdcfbdf0cd2e14d4a6861a7cd5/sentry_sdk-2.43.0-py2.py3-none-any.whl", hash = "sha256:4aacafcf1756ef066d359ae35030881917160ba7f6fc3ae11e0e58b09edc2d5d", size = 400997, upload-time = "2025-10-29T11:26:05.77Z" }, + { url = "https://files.pythonhosted.org/packages/4d/19/8d77f9992e5cbfcaa9133c3bf63b4fbbb051248802e1e803fed5c552fbb2/sentry_sdk-2.48.0-py2.py3-none-any.whl", hash = "sha256:6b12ac256769d41825d9b7518444e57fa35b5642df4c7c5e322af4d2c8721172", size = 414555, upload-time = "2025-12-16T14:55:40.152Z" }, ] [[package]] @@ -2337,25 +2274,21 @@ version = "1.3.7" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/8d/48/49393a96a2eef1ab418b17475fb92b8fcfad83d099e678751b05472e69de/setproctitle-1.3.7.tar.gz", hash = 
"sha256:bc2bc917691c1537d5b9bca1468437176809c7e11e5694ca79a9ca12345dcb9e", size = 27002, upload-time = "2025-09-05T12:51:25.278Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/f0/2dc88e842077719d7384d86cc47403e5102810492b33680e7dadcee64cd8/setproctitle-1.3.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2dc99aec591ab6126e636b11035a70991bc1ab7a261da428491a40b84376654e", size = 18049, upload-time = "2025-09-05T12:49:36.241Z" }, - { url = "https://files.pythonhosted.org/packages/f0/b4/50940504466689cda65680c9e9a1e518e5750c10490639fa687489ac7013/setproctitle-1.3.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cdd8aa571b7aa39840fdbea620e308a19691ff595c3a10231e9ee830339dd798", size = 13079, upload-time = "2025-09-05T12:49:38.088Z" }, { url = "https://files.pythonhosted.org/packages/d0/99/71630546b9395b095f4082be41165d1078204d1696c2d9baade3de3202d0/setproctitle-1.3.7-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2906b6c7959cdb75f46159bf0acd8cc9906cf1361c9e1ded0d065fe8f9039629", size = 32932, upload-time = "2025-09-05T12:49:39.271Z" }, { url = "https://files.pythonhosted.org/packages/50/22/cee06af4ffcfb0e8aba047bd44f5262e644199ae7527ae2c1f672b86495c/setproctitle-1.3.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6915964a6dda07920a1159321dcd6d94fc7fc526f815ca08a8063aeca3c204f1", size = 33736, upload-time = "2025-09-05T12:49:40.565Z" }, { url = "https://files.pythonhosted.org/packages/5c/00/a5949a8bb06ef5e7df214fc393bb2fb6aedf0479b17214e57750dfdd0f24/setproctitle-1.3.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cff72899861c765bd4021d1ff1c68d60edc129711a2fdba77f9cb69ef726a8b6", size = 35605, upload-time = "2025-09-05T12:49:42.362Z" }, { url = 
"https://files.pythonhosted.org/packages/b0/3a/50caca532a9343828e3bf5778c7a84d6c737a249b1796d50dd680290594d/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b7cb05bd446687ff816a3aaaf831047fc4c364feff7ada94a66024f1367b448c", size = 33143, upload-time = "2025-09-05T12:49:43.515Z" }, { url = "https://files.pythonhosted.org/packages/ca/14/b843a251296ce55e2e17c017d6b9f11ce0d3d070e9265de4ecad948b913d/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3a57b9a00de8cae7e2a1f7b9f0c2ac7b69372159e16a7708aa2f38f9e5cc987a", size = 34434, upload-time = "2025-09-05T12:49:45.31Z" }, { url = "https://files.pythonhosted.org/packages/c8/b7/06145c238c0a6d2c4bc881f8be230bb9f36d2bf51aff7bddcb796d5eed67/setproctitle-1.3.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d8828b356114f6b308b04afe398ed93803d7fca4a955dd3abe84430e28d33739", size = 32795, upload-time = "2025-09-05T12:49:46.419Z" }, - { url = "https://files.pythonhosted.org/packages/ef/dc/ef76a81fac9bf27b84ed23df19c1f67391a753eed6e3c2254ebcb5133f56/setproctitle-1.3.7-cp312-cp312-win32.whl", hash = "sha256:b0304f905efc845829ac2bc791ddebb976db2885f6171f4a3de678d7ee3f7c9f", size = 12552, upload-time = "2025-09-05T12:49:47.635Z" }, - { url = "https://files.pythonhosted.org/packages/e2/5b/a9fe517912cd6e28cf43a212b80cb679ff179a91b623138a99796d7d18a0/setproctitle-1.3.7-cp312-cp312-win_amd64.whl", hash = "sha256:9888ceb4faea3116cf02a920ff00bfbc8cc899743e4b4ac914b03625bdc3c300", size = 13247, upload-time = "2025-09-05T12:49:49.16Z" }, ] [[package]] name = "setuptools" -version = "79.0.1" +version = "80.9.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/bb/71/b6365e6325b3290e14957b2c3a804a529968c77a049b2ed40c095f749707/setuptools-79.0.1.tar.gz", hash = "sha256:128ce7b8f33c3079fd1b067ecbb4051a66e8526e7b65f6cec075dfc650ddfa88", size = 1367909, upload-time = "2025-04-23T22:20:59.241Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/18/5d/3bf57dcd21979b887f014ea83c24ae194cfcd12b9e0fda66b957c69d1fca/setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c", size = 1319958, upload-time = "2025-05-27T00:56:51.443Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0d/6d/b4752b044bf94cb802d88a888dc7d288baaf77d7910b7dedda74b5ceea0c/setuptools-79.0.1-py3-none-any.whl", hash = "sha256:e147c0549f27767ba362f9da434eab9c5dc0045d5304feb602a0af001089fc51", size = 1256281, upload-time = "2025-04-23T22:20:56.768Z" }, + { url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486, upload-time = "2025-05-27T00:56:49.664Z" }, ] [[package]] @@ -2376,6 +2309,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] +[[package]] +name = "smmap" +version = "5.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -2387,28 +2329,28 @@ wheels = [ 
[[package]] name = "sse-starlette" -version = "3.0.4" +version = "3.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "anyio" }, - { name = "starlette" }, + { name = "anyio", marker = "sys_platform == 'linux'" }, + { name = "starlette", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/17/8b/54651ad49bce99a50fd61a7f19c2b6a79fbb072e693101fbb1194c362054/sse_starlette-3.0.4.tar.gz", hash = "sha256:5e34286862e96ead0eb70f5ddd0bd21ab1f6473a8f44419dd267f431611383dd", size = 22576, upload-time = "2025-12-14T16:22:52.493Z" } +sdist = { url = "https://files.pythonhosted.org/packages/62/08/8f554b0e5bad3e4e880521a1686d96c05198471eed860b0eb89b57ea3636/sse_starlette-3.1.1.tar.gz", hash = "sha256:bffa531420c1793ab224f63648c059bcadc412bf9fdb1301ac8de1cf9a67b7fb", size = 24306, upload-time = "2025-12-26T15:22:53.836Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/71/22/8ab1066358601163e1ac732837adba3672f703818f693e179b24e0d3b65c/sse_starlette-3.0.4-py3-none-any.whl", hash = "sha256:32c80ef0d04506ced4b0b6ab8fe300925edc37d26f666afb1874c754895f5dc3", size = 11764, upload-time = "2025-12-14T16:22:51.453Z" }, + { url = "https://files.pythonhosted.org/packages/e3/31/4c281581a0f8de137b710a07f65518b34bcf333b201cfa06cfda9af05f8a/sse_starlette-3.1.1-py3-none-any.whl", hash = "sha256:bb38f71ae74cfd86b529907a9fda5632195dfa6ae120f214ea4c890c7ee9d436", size = 12442, upload-time = "2025-12-26T15:22:52.911Z" }, ] [[package]] name = "starlette" -version = "0.49.3" +version = "0.50.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "anyio" }, - { name = "typing-extensions" }, + { name = "anyio", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/de/1a/608df0b10b53b0beb96a37854ee05864d182ddd4b1156a22f1ad3860425a/starlette-0.49.3.tar.gz", hash = 
"sha256:1c14546f299b5901a1ea0e34410575bc33bbd741377a10484a54445588d00284", size = 2655031, upload-time = "2025-11-01T15:12:26.13Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/b8/73a0e6a6e079a9d9cfa64113d771e421640b6f679a52eeb9b32f72d871a1/starlette-0.50.0.tar.gz", hash = "sha256:a2a17b22203254bcbc2e1f926d2d55f3f9497f769416b3190768befe598fa3ca", size = 2646985, upload-time = "2025-11-01T15:25:27.516Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a3/e0/021c772d6a662f43b63044ab481dc6ac7592447605b5b35a957785363122/starlette-0.49.3-py3-none-any.whl", hash = "sha256:b579b99715fdc2980cf88c8ec96d3bf1ce16f5a8051a7c2b84ef9b1cdecaea2f", size = 74340, upload-time = "2025-11-01T15:12:24.387Z" }, + { url = "https://files.pythonhosted.org/packages/d9/52/1064f510b141bd54025f9b55105e26d1fa970b9be67ad766380a3c9b74b0/starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca", size = 74033, upload-time = "2025-11-01T15:25:25.461Z" }, ] [[package]] @@ -2446,18 +2388,15 @@ name = "tiktoken" version = "0.12.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "regex" }, - { name = "requests" }, + { name = "regex", marker = "sys_platform == 'linux'" }, + { name = "requests", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" }, - { url = 
"https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" }, { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" }, { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665, upload-time = "2025-10-06T20:21:56.129Z" }, { url = "https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230, upload-time = "2025-10-06T20:21:57.546Z" }, { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688, upload-time = "2025-10-06T20:21:58.619Z" }, - { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694, upload-time = "2025-10-06T20:21:59.876Z" }, ] [[package]] @@ -2487,38 +2426,97 @@ wheels = [ [[package]] name = "torch" -version = "2.9.0" +version = "2.9.0+cu128" +source = { 
registry = "https://download.pytorch.org/whl/cu128" } +resolution-markers = [ + "sys_platform == 'linux'", +] +dependencies = [ + { name = "filelock", marker = "sys_platform == 'linux'" }, + { name = "fsspec", marker = "sys_platform == 'linux'" }, + { name = "jinja2", marker = "sys_platform == 'linux'" }, + { name = "networkx", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cufile-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvshmem-cu12", marker = "sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "sys_platform == 'linux'" }, + { name = "sympy", marker = "sys_platform == 'linux'" }, + { name = "triton", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e1765625084e320f1eb2f4eb5fd9d14d39d08d7a1880c10a307ce5de20831d27" }, + { url = 
"https://download.pytorch.org/whl/cu128/torch-2.9.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:87c62d3b95f1a2270bd116dbd47dc515c0b2035076fbb4a03b4365ea289e89c4" }, +] + +[[package]] +name = "torch" +version = "2.9.1" +source = { registry = "https://download.pytorch.org/whl/cpu" } +resolution-markers = [ + "sys_platform == 'darwin'", +] +dependencies = [ + { name = "filelock", marker = "sys_platform == 'darwin'" }, + { name = "fsspec", marker = "sys_platform == 'darwin'" }, + { name = "jinja2", marker = "sys_platform == 'darwin'" }, + { name = "networkx", marker = "sys_platform == 'darwin'" }, + { name = "setuptools", marker = "sys_platform == 'darwin'" }, + { name = "sympy", marker = "sys_platform == 'darwin'" }, + { name = "typing-extensions", marker = "sys_platform == 'darwin'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp312-none-macosx_11_0_arm64.whl" }, +] + +[[package]] +name = "torch" +version = "2.9.1+cpu" +source = { registry = "https://download.pytorch.org/whl/cpu" } +resolution-markers = [ + "sys_platform != 'darwin' and sys_platform != 'linux'", +] +dependencies = [ + { name = "filelock", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "fsspec", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "jinja2", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "networkx", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "setuptools", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "sympy", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, + { name = "typing-extensions", marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, +] +wheels = [ + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-win_amd64.whl" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-win_arm64.whl" }, +] + 
+[[package]] +name = "torch-c-dlpack-ext" +version = "0.1.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, - { name = "fsspec" }, - { name = "jinja2" }, - { name = "networkx" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "setuptools" }, - { name = "sympy" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions" }, + { name = "torch", 
version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" }, + { name = "torch", version = "2.9.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform != 'darwin' and sys_platform != 'linux'" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/b7/cc/24e5eee56bfe2f99b9c026d55bc1a77ceaf409791d9be71a001ede1b2f4e/torch_c_dlpack_ext-0.1.4.tar.gz", hash = "sha256:ad292d17e285ab9523940e51e87d21ffce4982ce8beb46fb18b5c2b4760a1a10", size = 3683, upload-time = "2025-12-09T00:37:56.739Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d3/3985739f3b8e88675127bf70f82b3a48ae083e39cda56305dbd90398fec0/torch-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e5f7af1dc4c0a7c4a260c2534f41ddaf209714f7c89145e644c44712fbd6b642", size = 104107898, upload-time = "2025-10-15T15:46:20.883Z" }, - { url = "https://files.pythonhosted.org/packages/a5/4b/f4bb2e6c25d0272f798cd6d7a04ed315da76cec68c602d87040c7847287f/torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:01cff95ecd9a212ea2f141db28acccdceb6a4c54f64e6c51091146f5e2a772c6", size = 899738273, upload-time = "2025-10-15T15:50:04.188Z" }, - { url = "https://files.pythonhosted.org/packages/66/11/c1c5ba6691cda6279087c35bd626536e4fd29521fe740abf5008377a9a02/torch-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:4582b162f541651f0cb184d3e291c05c2f556c7117c64a9873e2ee158d40062b", size = 109280887, upload-time = "2025-10-15T15:46:26.228Z" }, - { url = "https://files.pythonhosted.org/packages/dd/5f/b85bd8c05312d71de9402bf5868d217c38827cfd09d8f8514e5be128a52b/torch-2.9.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:33f58e9a102a91259af289d50525c30323b5c9ae1d31322b6447c0814da68695", size = 74478983, upload-time = "2025-10-15T15:46:39.406Z" }, + { 
url = "https://files.pythonhosted.org/packages/84/c8/97c3d4a1c05dd41e4ba70a8abff47a93951c035a3db1a532777b372f63bc/torch_c_dlpack_ext-0.1.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:78253bc0d2ee4f0c4bf38e207f19de93ba3625430e5ecb08f3a800d93ea9a144", size = 5281944, upload-time = "2025-12-09T00:37:32.153Z" }, + { url = "https://files.pythonhosted.org/packages/5f/4c/7d59344006807613baca1daf3109ea6a81743146330b5afde96dc953115c/torch_c_dlpack_ext-0.1.4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b3f6beab017961a082f4012a68fcdeddb55a8b10cd8db630902bd46068b5e5", size = 433744, upload-time = "2025-12-09T00:37:33.538Z" }, + { url = "https://files.pythonhosted.org/packages/35/c0/0ae9067fd9f15f1feacfd7398314aa48f922ddb9cbeb5f95c8a2e1831cb8/torch_c_dlpack_ext-0.1.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:383794a3281862f8736efa99789713187fb8e1937a5e2f32456bbbe52fa3a8a3", size = 888525, upload-time = "2025-12-09T00:37:35.374Z" }, + { url = "https://files.pythonhosted.org/packages/b9/5c/74143cedeaf98f632b60c68770e697ce978f75ff6de8f3fe6e58f46459ba/torch_c_dlpack_ext-0.1.4-cp312-cp312-win_amd64.whl", hash = "sha256:d4a343642c8ee46f1257731b0a5473a76eacaf1a09db13dbf12a2b012b586041", size = 1473667, upload-time = "2025-12-09T00:37:37.382Z" }, ] [[package]] @@ -2526,13 +2524,11 @@ name = "torchaudio" version = "2.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "torch" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/63/3c0ede3aa3d19a8a6698ddd107fa88660549360b51bf8ce2717cd498d800/torchaudio-2.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab4cbcccfd873b0fb41fcb39c9869e59ef84bb95b093f6f58e2d05172a7500d2", size = 809116, upload-time = "2025-10-15T15:52:00.911Z" }, { url = 
"https://files.pythonhosted.org/packages/be/d5/25e58745defe9d05893d3cba5c0e1a76aeaac503ac5ec4d9f83c871df71c/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7f93388b6e536c14d6015b6f75277a8b45efc532f61b35adc1ed06c98a86003e", size = 476020, upload-time = "2025-10-15T15:51:59.967Z" }, { url = "https://files.pythonhosted.org/packages/f0/9c/58b8b49dfba2ae85e41ca86b0c52de45bbbea01987490de219c99c523a58/torchaudio-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:508318a2130b40ad51378f90caf8727a4bd3ac2b296f2b90c900b44e6068a940", size = 2059901, upload-time = "2025-10-15T15:51:54.634Z" }, - { url = "https://files.pythonhosted.org/packages/d7/eb/58b05f75d12f69ccc460893a20c999da082e063082120ed06e05cca3a053/torchaudio-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:82117e3a605f2959dc09b4cd8a11178d6e92727d5f85e5d4f9fe47502f84ee96", size = 665350, upload-time = "2025-10-15T15:52:08.384Z" }, ] [[package]] @@ -2540,15 +2536,13 @@ name = "torchvision" version = "0.24.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, - { name = "pillow" }, - { name = "torch" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "pillow", marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/47/ef/81e4e69e02e2c4650b30e8c11c8974f946682a30e0ab7e9803a831beff76/torchvision-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c61d40bcd2e2451e932902a702ad495ba1ec6f279e90b1e15cef2bb55dc911e2", size = 1891726, upload-time = "2025-10-15T15:51:16.977Z" }, { url = "https://files.pythonhosted.org/packages/00/7b/e3809b3302caea9a12c13f3adebe4fef127188438e719fd6c8dc93db1da6/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = 
"sha256:b0531d1483fc322d7da0d83be52f0df860a75114ab87dbeeb9de765feaeda843", size = 2419495, upload-time = "2025-10-15T15:51:11.885Z" }, { url = "https://files.pythonhosted.org/packages/7e/e6/7324ead6793075a8c75c56abeed1236d1750de16a5613cfe2ddad164a92a/torchvision-0.24.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:26b9dd9c083f8e5f7ac827de6d5b88c615d9c582dc87666770fbdf16887e4c25", size = 8050480, upload-time = "2025-10-15T15:51:24.012Z" }, - { url = "https://files.pythonhosted.org/packages/3e/ad/3c56fcd2a0d6e8afa80e115b5ade4302232ec99655220a51d05709819523/torchvision-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:060b7c50ed4b3fb0316b08e2e31bfd874ec2f63ef5ae02f81e54341ca4e88703", size = 4292225, upload-time = "2025-10-15T15:51:27.699Z" }, ] [[package]] @@ -2565,12 +2559,13 @@ wheels = [ [[package]] name = "transformers" -version = "4.57.1" +version = "4.57.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "huggingface-hub" }, - { name = "numpy" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux'" }, { name = "packaging" }, { name = "pyyaml" }, { name = "regex" }, @@ -2579,9 +2574,9 @@ dependencies = [ { name = "tokenizers" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" } +sdist = { url = "https://files.pythonhosted.org/packages/dd/70/d42a739e8dfde3d92bb2fff5819cbf331fe9657323221e79415cd5eb65ee/transformers-4.57.3.tar.gz", hash = "sha256:df4945029aaddd7c09eec5cad851f30662f8bd1746721b34cc031d70c65afebc", size = 10139680, upload-time = 
"2025-11-25T15:51:30.139Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, + { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" }, ] [[package]] @@ -2589,47 +2584,48 @@ name = "triton" version = "3.5.0" source = { registry = "https://pypi.org/simple" } wheels = [ + { url = "https://files.pythonhosted.org/packages/87/9b/30988039e1e84df7554fba24e6a734d2d0e847af33cabdf9b532b3c51456/triton-3.5.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da21fccceafc163e3a5e857abe34351ef76345af06cabf9637a914742671f0b", size = 159946647, upload-time = "2025-10-15T19:15:56.325Z" }, { url = "https://files.pythonhosted.org/packages/f5/3a/e991574f3102147b642e49637e0281e9bb7c4ba254edb2bab78247c85e01/triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9e71db82261c4ffa3921cd050cd5faa18322d2d405c30eb56084afaff3b0833", size = 170476535, upload-time = "2025-10-13T16:38:05.18Z" }, ] [[package]] name = "ty" -version = "0.0.5" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9e/db/6299d478000f4f1c6f9bf2af749359381610ffc4cbe6713b66e436ecf6e7/ty-0.0.5.tar.gz", hash = "sha256:983da6330773ff71e2b249810a19c689f9a0372f6e21bbf7cde37839d05b4346", size = 4806218, upload-time = "2025-12-20T21:19:17.24Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/98/c1f61ba378b4191e641bb36c07b7fcc70ff844d61be7a4bf2fea7472b4a9/ty-0.0.5-py3-none-linux_armv6l.whl", hash = 
"sha256:1594cd9bb68015eb2f5a3c68a040860f3c9306dc6667d7a0e5f4df9967b460e2", size = 9785554, upload-time = "2025-12-20T21:19:05.024Z" }, - { url = "https://files.pythonhosted.org/packages/ab/f9/b37b77c03396bd779c1397dae4279b7ad79315e005b3412feed8812a4256/ty-0.0.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:7c0140ba980233d28699d9ddfe8f43d0b3535d6a3bbff9935df625a78332a3cf", size = 9603995, upload-time = "2025-12-20T21:19:15.256Z" }, - { url = "https://files.pythonhosted.org/packages/7d/70/4e75c11903b0e986c0203040472627cb61d6a709e1797fb08cdf9d565743/ty-0.0.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:15de414712cde92048ae4b1a77c4dc22920bd23653fe42acaf73028bad88f6b9", size = 9145815, upload-time = "2025-12-20T21:19:36.481Z" }, - { url = "https://files.pythonhosted.org/packages/89/05/93983dfcf871a41dfe58e5511d28e6aa332a1f826cc67333f77ae41a2f8a/ty-0.0.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:438aa51ad6c5fae64191f8d58876266e26f9250cf09f6624b6af47a22fa88618", size = 9619849, upload-time = "2025-12-20T21:19:19.084Z" }, - { url = "https://files.pythonhosted.org/packages/82/b6/896ab3aad59f846823f202e94be6016fb3f72434d999d2ae9bd0f28b3af9/ty-0.0.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1b3d373fd96af1564380caf153600481c676f5002ee76ba8a7c3508cdff82ee0", size = 9606611, upload-time = "2025-12-20T21:19:24.583Z" }, - { url = "https://files.pythonhosted.org/packages/ca/ae/098e33fc92330285ed843e2750127e896140c4ebd2d73df7732ea496f588/ty-0.0.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8453692503212ad316cf8b99efbe85a91e5f63769c43be5345e435a1b16cba5a", size = 10029523, upload-time = "2025-12-20T21:19:07.055Z" }, - { url = "https://files.pythonhosted.org/packages/04/5a/f4b4c33758b9295e9aca0de9645deca0f4addd21d38847228723a6e780fc/ty-0.0.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2e4c454139473abbd529767b0df7a795ed828f780aef8d0d4b144558c0dc4446", size = 
10870892, upload-time = "2025-12-20T21:19:34.495Z" }, - { url = "https://files.pythonhosted.org/packages/c3/c5/4e3e7e88389365aa1e631c99378711cf0c9d35a67478cb4720584314cf44/ty-0.0.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:426d4f3b82475b1ec75f3cc9ee5a667c8a4ae8441a09fcd8e823a53b706d00c7", size = 10599291, upload-time = "2025-12-20T21:19:26.557Z" }, - { url = "https://files.pythonhosted.org/packages/c1/5d/138f859ea87bd95e17b9818e386ae25a910e46521c41d516bf230ed83ffc/ty-0.0.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5710817b67c6b2e4c0224e4f319b7decdff550886e9020f6d46aa1ce8f89a609", size = 10413515, upload-time = "2025-12-20T21:19:11.094Z" }, - { url = "https://files.pythonhosted.org/packages/27/21/1cbcd0d3b1182172f099e88218137943e0970603492fb10c7c9342369d9a/ty-0.0.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23c55ef08882c7c5ced1ccb90b4eeefa97f690aea254f58ac0987896c590f76", size = 10144992, upload-time = "2025-12-20T21:19:13.225Z" }, - { url = "https://files.pythonhosted.org/packages/ad/30/fdac06a5470c09ad2659a0806497b71f338b395d59e92611f71b623d05a0/ty-0.0.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b9e4c1a28a23b14cf8f4f793f4da396939f16c30bfa7323477c8cc234e352ac4", size = 9606408, upload-time = "2025-12-20T21:19:09.212Z" }, - { url = "https://files.pythonhosted.org/packages/09/93/e99dcd7f53295192d03efd9cbcec089a916f49cad4935c0160ea9adbd53d/ty-0.0.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:4e9ebb61529b9745af662e37c37a01ad743cdd2c95f0d1421705672874d806cd", size = 9630040, upload-time = "2025-12-20T21:19:38.165Z" }, - { url = "https://files.pythonhosted.org/packages/d7/f8/6d1e87186e4c35eb64f28000c1df8fd5f73167ce126c5e3dd21fd1204a23/ty-0.0.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5eb191a8e332f50f56dfe45391bdd7d43dd4ef6e60884710fd7ce84c5d8c1eb5", size = 9754016, upload-time = "2025-12-20T21:19:32.79Z" }, - { url = 
"https://files.pythonhosted.org/packages/28/e6/20f989342cb3115852dda404f1d89a10a3ce93f14f42b23f095a3d1a00c9/ty-0.0.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:92ed7451a1e82ee134a2c24ca43b74dd31e946dff2b08e5c34473e6b051de542", size = 10252877, upload-time = "2025-12-20T21:19:20.787Z" }, - { url = "https://files.pythonhosted.org/packages/57/9d/fc66fa557443233dfad9ae197ff3deb70ae0efcfb71d11b30ef62f5cdcc3/ty-0.0.5-py3-none-win32.whl", hash = "sha256:71f6707e4c1c010c158029a688a498220f28bb22fdb6707e5c20e09f11a5e4f2", size = 9212640, upload-time = "2025-12-20T21:19:30.817Z" }, - { url = "https://files.pythonhosted.org/packages/68/b6/05c35f6dea29122e54af0e9f8dfedd0a100c721affc8cc801ebe2bc2ed13/ty-0.0.5-py3-none-win_amd64.whl", hash = "sha256:2b8b754a0d7191e94acdf0c322747fec34371a4d0669f5b4e89549aef28814ae", size = 10034701, upload-time = "2025-12-20T21:19:28.311Z" }, - { url = "https://files.pythonhosted.org/packages/df/ca/4201ed5cb2af73912663d0c6ded927c28c28b3c921c9348aa8d2cfef4853/ty-0.0.5-py3-none-win_arm64.whl", hash = "sha256:83bea5a5296caac20d52b790ded2b830a7ff91c4ed9f36730fe1f393ceed6654", size = 9566474, upload-time = "2025-12-20T21:19:22.518Z" }, +version = "0.0.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b3/43/8be3ec2e2ce6119cff9ee3a207fae0cb4f2b4f8ed6534175130a32be24a7/ty-0.0.7.tar.gz", hash = "sha256:90e53b20b86c418ee41a8385f17da44cc7f916f96f9eee87593423ce8292ca72", size = 4826677, upload-time = "2025-12-24T21:28:49.136Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/56/fafa123acf955089306372add312f16e97aba61f7c4daf74e2bb9c350d23/ty-0.0.7-py3-none-linux_armv6l.whl", hash = "sha256:b30105bd9a0b064497111c50c206d5b6a032f29bcf39f09a12085c3009d72784", size = 9862360, upload-time = "2025-12-24T21:28:36.762Z" }, + { url = "https://files.pythonhosted.org/packages/71/f4/9c30ff498d9a60e24f16d26c0cf93cd03a119913ffa720a77149f02df06e/ty-0.0.7-py3-none-macosx_10_12_x86_64.whl", 
hash = "sha256:b4df20889115f3d5611a9d9cdedc222e3fd82b5fe87bb0a9f7246e53a23becc7", size = 9712866, upload-time = "2025-12-24T21:28:25.926Z" }, + { url = "https://files.pythonhosted.org/packages/43/84/e06a4a6e4011890027ffee41efbf261b1335103d09009d625ace7f1a60eb/ty-0.0.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f699589d8511e1e17c5a7edfc5f4a4e80f2a6d4a3932a0e9e3422fd32d731472", size = 9221692, upload-time = "2025-12-24T21:28:29.649Z" }, + { url = "https://files.pythonhosted.org/packages/7a/e9/ebb4192d3627730125d40ee403a17dc91bab59d69c3eff286453b3218d01/ty-0.0.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3eaec2d8aa153ee4bcc43b17a384d0f9e66177c8c8127be3358b6b8348b9e3b", size = 9710340, upload-time = "2025-12-24T21:28:55.148Z" }, + { url = "https://files.pythonhosted.org/packages/8f/4a/ec144458a9cfb324d5cb471483094e62e74d73179343dff262a5cca1a1e1/ty-0.0.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:177d160295e6a56bdf0b61f6120bc4502fff301d4d10855ba711c109aa7f37fb", size = 9670317, upload-time = "2025-12-24T21:28:43.096Z" }, + { url = "https://files.pythonhosted.org/packages/b6/94/fe7106fd5e2ac06b81fba7b785a6216774618edc3fda9e17f58efe3cede6/ty-0.0.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30518b95ab5cc83615794cca765a5fb86df39a0d9c3dadc0ab2d787ab7830008", size = 10096517, upload-time = "2025-12-24T21:28:23.667Z" }, + { url = "https://files.pythonhosted.org/packages/45/d9/db96ccfd663c96bdd4bb63db72899198c01445012f939477a5318a563f14/ty-0.0.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:7867b3f75c2d9602cc6fb3b6d462580b707c2d112d4b27037142b0d01f8bfd03", size = 10996406, upload-time = "2025-12-24T21:28:39.134Z" }, + { url = "https://files.pythonhosted.org/packages/94/da/103915c08c3e6a14f95959614646fcdc9a240cd9a039fadbdcd086c819ee/ty-0.0.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:878d45858e209b7904753fbc5155f4cb75dadc20a26bbb77614bfef31580f9ae", size = 10712829, upload-time = "2025-12-24T21:28:27.745Z" }, + { url = "https://files.pythonhosted.org/packages/47/c0/d9be417bc8e459e13e9698978579eec9868f91f4c5d6ef663249967fec8b/ty-0.0.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:651820b193901825afce40ae68f6a51cd64dbfa4b81a45db90061401261f25e4", size = 10486541, upload-time = "2025-12-24T21:28:45.17Z" }, + { url = "https://files.pythonhosted.org/packages/ad/09/d1858c66620d8ae566e021ad0d7168914b1568841f8fe9e439116ce6b440/ty-0.0.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f56a5a0c1c045863b1b70c358a392b3f73b8528c5c571d409f19dd465525e116", size = 10255312, upload-time = "2025-12-24T21:28:53.17Z" }, + { url = "https://files.pythonhosted.org/packages/b6/0a/78f75089db491fd5fcc13d2845a0b2771b7f7d377450c64c6616e9c227bc/ty-0.0.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:748218fbc1f7b7f1b9d14e77d4f3d7fec72af794417e26b0185bdb94153afe1c", size = 9696201, upload-time = "2025-12-24T21:28:57.345Z" }, + { url = "https://files.pythonhosted.org/packages/01/9e/b26e94832fd563fef6f77a4487affc77a027b0e53106422c66aafb37fa01/ty-0.0.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1ff80f3985a52a7358b9069b4a8d223e92cf312544a934a062d6d3a4fb6876b3", size = 9688907, upload-time = "2025-12-24T21:28:59.485Z" }, + { url = "https://files.pythonhosted.org/packages/5a/8f/cc48601fb92c964cf6c34277e0d947076146b7de47aa11b5dbae45e01ce7/ty-0.0.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a808910ce672ba4446699f4c021283208f58f988bcfc3bdbdfc6e005819d9ee0", size = 9829982, upload-time = "2025-12-24T21:28:34.429Z" }, + { url = "https://files.pythonhosted.org/packages/b5/af/7fa9c2bfa25865968bded637f7e71f1a712f4fbede88f487b6a9101ab936/ty-0.0.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:2718fea5f314eda01703fb406ec89b1fc8710b3fc6a09bbd6f7a4f3502ddc889", size = 10361037, upload-time = 
"2025-12-24T21:28:47.027Z" }, + { url = "https://files.pythonhosted.org/packages/1c/5b/1a6ff1495975cd1c02aa8d03bc5c9d8006eaeb8bf354446f88d70f0518fd/ty-0.0.7-py3-none-win32.whl", hash = "sha256:ae89bb8dc50deb66f34eab3113aa61ac5d7f85ecf16279e5918548085a89021c", size = 9295092, upload-time = "2025-12-24T21:28:51.041Z" }, + { url = "https://files.pythonhosted.org/packages/ff/f6/47e9364635d048002354f84d2d0d6dfc9eb166dc67850739f88e1fec4fc5/ty-0.0.7-py3-none-win_amd64.whl", hash = "sha256:25bd20e3d4d0f07b422f9b42711ba24d28116031273bd23dbda66cec14df1c06", size = 10162816, upload-time = "2025-12-24T21:28:41.006Z" }, + { url = "https://files.pythonhosted.org/packages/7f/f4/c4fc28410c4493982b7481fb23f62bacb02fd2912ebec3b9bc7de18bebb8/ty-0.0.7-py3-none-win_arm64.whl", hash = "sha256:c87d27484dba9fca0053b6a9eee47eecc760aab2bbb8e6eab3d7f81531d1ad0c", size = 9653112, upload-time = "2025-12-24T21:28:31.562Z" }, ] [[package]] name = "typer" -version = "0.20.0" +version = "0.21.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click" }, - { name = "rich" }, - { name = "shellingham" }, - { name = "typing-extensions" }, + { name = "click", marker = "sys_platform == 'linux'" }, + { name = "rich", marker = "sys_platform == 'linux'" }, + { name = "shellingham", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8f/28/7c85c8032b91dbe79725b6f17d2fffc595dff06a35c7a30a37bef73a1ab4/typer-0.20.0.tar.gz", hash = "sha256:1aaf6494031793e4876fb0bacfa6a912b551cf43c1e63c800df8b1a866720c37", size = 106492, upload-time = "2025-10-20T17:03:49.445Z" } +sdist = { url = "https://files.pythonhosted.org/packages/85/30/ff9ede605e3bd086b4dd842499814e128500621f7951ca1e5ce84bbf61b1/typer-0.21.0.tar.gz", hash = "sha256:c87c0d2b6eee3b49c5c64649ec92425492c14488096dfbc8a0c2799b2f6f9c53", size = 106781, upload-time = "2025-12-25T09:54:53.651Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/78/64/7713ffe4b5983314e9d436a90d5bd4f63b6054e2aca783a3cfc44cb95bbf/typer-0.20.0-py3-none-any.whl", hash = "sha256:5b463df6793ec1dca6213a3cf4c0f03bc6e322ac5e16e13ddd622a889489784a", size = 47028, upload-time = "2025-10-20T17:03:47.617Z" }, + { url = "https://files.pythonhosted.org/packages/e1/e4/5ebc1899d31d2b1601b32d21cfb4bba022ae6fce323d365f0448031b1660/typer-0.21.0-py3-none-any.whl", hash = "sha256:c79c01ca6b30af9fd48284058a7056ba0d3bf5cf10d0ff3d0c5b11b68c258ac6", size = 47109, upload-time = "2025-12-25T09:54:51.918Z" }, ] [[package]] @@ -2664,35 +2660,34 @@ wheels = [ [[package]] name = "urllib3" -version = "2.5.0" +version = "2.6.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1e/24/a2a2ed9addd907787d7aa0355ba36a6cadf1768b934c652ea78acbd59dcd/urllib3-2.6.2.tar.gz", hash = "sha256:016f9c98bb7e98085cb2b4b17b87d2c702975664e4f060c6532e64d1c1a5e797", size = 432930, upload-time = "2025-12-11T15:56:40.252Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, + { url = "https://files.pythonhosted.org/packages/6d/b9/4095b668ea3678bf6a0af005527f39de12fb026516fb3df17495a733b7f8/urllib3-2.6.2-py3-none-any.whl", hash = "sha256:ec21cddfe7724fc7cb4ba4bea7aa8e2ef36f607a4bab81aa6ce42a13dc3f03dd", size = 131182, upload-time = "2025-12-11T15:56:38.584Z" }, ] [[package]] name = "uvicorn" -version = "0.38.0" +version = "0.40.0" source = { registry = 
"https://pypi.org/simple" } dependencies = [ - { name = "click" }, - { name = "h11" }, + { name = "click", marker = "sys_platform == 'linux'" }, + { name = "h11", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/cb/ce/f06b84e2697fef4688ca63bdb2fdf113ca0a3be33f94488f2cadb690b0cf/uvicorn-0.38.0.tar.gz", hash = "sha256:fd97093bdd120a2609fc0d3afe931d4d4ad688b6e75f0f929fde1bc36fe0e91d", size = 80605, upload-time = "2025-10-18T13:46:44.63Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/d1/8f3c683c9561a4e6689dd3b1d345c815f10f86acd044ee1fb9a4dcd0b8c5/uvicorn-0.40.0.tar.gz", hash = "sha256:839676675e87e73694518b5574fd0f24c9d97b46bea16df7b8c05ea1a51071ea", size = 81761, upload-time = "2025-12-21T14:16:22.45Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ee/d9/d88e73ca598f4f6ff671fb5fde8a32925c2e08a637303a1d12883c7305fa/uvicorn-0.38.0-py3-none-any.whl", hash = "sha256:48c0afd214ceb59340075b4a052ea1ee91c16fbc2a9b1469cca0e54566977b02", size = 68109, upload-time = "2025-10-18T13:46:42.958Z" }, + { url = "https://files.pythonhosted.org/packages/3d/d8/2083a1daa7439a66f3a48589a57d576aa117726762618f6bb09fe3798796/uvicorn-0.40.0-py3-none-any.whl", hash = "sha256:c6c8f55bc8bf13eb6fa9ff87ad62308bbbc33d0b67f84293151efe87e0d5f2ee", size = 68502, upload-time = "2025-12-21T14:16:21.041Z" }, ] [package.optional-dependencies] standard = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "httptools" }, - { name = "python-dotenv" }, - { name = "pyyaml" }, - { name = "uvloop", marker = "platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'" }, - { name = "watchfiles" }, - { name = "websockets" }, + { name = "httptools", marker = "sys_platform == 'linux'" }, + { name = "python-dotenv", marker = "sys_platform == 'linux'" }, + { name = "pyyaml", marker = "sys_platform == 'linux'" }, + { name = "uvloop", marker = "platform_python_implementation != 
'PyPy' and sys_platform == 'linux'" }, + { name = "watchfiles", marker = "sys_platform == 'linux'" }, + { name = "websockets", marker = "sys_platform == 'linux'" }, ] [[package]] @@ -2701,8 +2696,6 @@ version = "0.22.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/06/f0/18d39dbd1971d6d62c4629cc7fa67f74821b0dc1f5a77af43719de7936a7/uvloop-0.22.1.tar.gz", hash = "sha256:6c84bae345b9147082b17371e3dd5d42775bddce91f885499017f4607fdaf39f", size = 2443250, upload-time = "2025-10-16T22:17:19.342Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3d/ff/7f72e8170be527b4977b033239a83a68d5c881cc4775fca255c677f7ac5d/uvloop-0.22.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fe94b4564e865d968414598eea1a6de60adba0c040ba4ed05ac1300de402cd42", size = 1359936, upload-time = "2025-10-16T22:16:29.436Z" }, - { url = "https://files.pythonhosted.org/packages/c3/c6/e5d433f88fd54d81ef4be58b2b7b0cea13c442454a1db703a1eea0db1a59/uvloop-0.22.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:51eb9bd88391483410daad430813d982010f9c9c89512321f5b60e2cddbdddd6", size = 752769, upload-time = "2025-10-16T22:16:30.493Z" }, { url = "https://files.pythonhosted.org/packages/24/68/a6ac446820273e71aa762fa21cdcc09861edd3536ff47c5cd3b7afb10eeb/uvloop-0.22.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:700e674a166ca5778255e0e1dc4e9d79ab2acc57b9171b79e65feba7184b3370", size = 4317413, upload-time = "2025-10-16T22:16:31.644Z" }, { url = "https://files.pythonhosted.org/packages/5f/6f/e62b4dfc7ad6518e7eff2516f680d02a0f6eb62c0c212e152ca708a0085e/uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b5b1ac819a3f946d3b2ee07f09149578ae76066d70b44df3fa990add49a82e4", size = 4426307, upload-time = "2025-10-16T22:16:32.917Z" }, { url = 
"https://files.pythonhosted.org/packages/90/60/97362554ac21e20e81bcef1150cb2a7e4ffdaf8ea1e5b2e8bf7a053caa18/uvloop-0.22.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e047cc068570bac9866237739607d1313b9253c3051ad84738cbb095be0537b2", size = 4131970, upload-time = "2025-10-16T22:16:34.015Z" }, @@ -2714,65 +2707,65 @@ name = "vllm" version = "0.13.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "aiohttp" }, - { name = "anthropic" }, - { name = "blake3" }, - { name = "cachetools" }, - { name = "cbor2" }, - { name = "cloudpickle" }, - { name = "compressed-tensors" }, - { name = "depyf" }, - { name = "diskcache" }, - { name = "einops" }, - { name = "fastapi", extra = ["standard"] }, - { name = "filelock" }, - { name = "flashinfer-python" }, - { name = "gguf" }, - { name = "ijson" }, - { name = "lark" }, - { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'" }, - { name = "lm-format-enforcer" }, - { name = "mcp" }, - { name = "mistral-common", extra = ["image"] }, - { name = "model-hosting-container-standards" }, - { name = "msgspec" }, - { name = "ninja" }, - { name = "numba" }, - { name = "numpy" }, - { name = "openai" }, - { name = "openai-harmony" }, - { name = "opencv-python-headless" }, - { name = "outlines-core" }, - { name = "partial-json-parser" }, - { name = "pillow" }, - { name = "prometheus-client" }, - { name = "prometheus-fastapi-instrumentator" }, - { name = "protobuf" }, - { name = "psutil" }, - { name = "py-cpuinfo" }, - { name = "pybase64" }, - { name = "pydantic" }, - { name = "python-json-logger" }, - { name = "pyyaml" }, - { name = "pyzmq" }, - { name = "ray", extra = ["cgraph"] }, - { name = "regex" }, - { name = "requests" }, - { name = "scipy" }, - { name = "sentencepiece" }, - { name = "setproctitle" }, - { name = "setuptools" }, - { name = "six" }, - { name = 
"tiktoken" }, - { name = "tokenizers" }, - { name = "torch" }, - { name = "torchaudio" }, - { name = "torchvision" }, - { name = "tqdm" }, - { name = "transformers" }, - { name = "typing-extensions" }, - { name = "watchfiles" }, - { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'ppc64le' or platform_machine == 's390x' or platform_machine == 'x86_64'" }, + { name = "aiohttp", marker = "sys_platform == 'linux'" }, + { name = "anthropic", marker = "sys_platform == 'linux'" }, + { name = "blake3", marker = "sys_platform == 'linux'" }, + { name = "cachetools", marker = "sys_platform == 'linux'" }, + { name = "cbor2", marker = "sys_platform == 'linux'" }, + { name = "cloudpickle", marker = "sys_platform == 'linux'" }, + { name = "compressed-tensors", marker = "sys_platform == 'linux'" }, + { name = "depyf", marker = "sys_platform == 'linux'" }, + { name = "diskcache", marker = "sys_platform == 'linux'" }, + { name = "einops", marker = "sys_platform == 'linux'" }, + { name = "fastapi", extra = ["standard"], marker = "sys_platform == 'linux'" }, + { name = "filelock", marker = "sys_platform == 'linux'" }, + { name = "flashinfer-python", marker = "sys_platform == 'linux'" }, + { name = "gguf", marker = "sys_platform == 'linux'" }, + { name = "ijson", marker = "sys_platform == 'linux'" }, + { name = "lark", marker = "sys_platform == 'linux'" }, + { name = "llguidance", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'ppc64le' and sys_platform == 'linux') or (platform_machine == 's390x' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "lm-format-enforcer", marker = "sys_platform == 'linux'" }, + { name = "mcp", marker = "sys_platform == 'linux'" }, + { name = "mistral-common", extra = ["image"], marker = "sys_platform == 'linux'" }, + { name = 
"model-hosting-container-standards", marker = "sys_platform == 'linux'" }, + { name = "msgspec", marker = "sys_platform == 'linux'" }, + { name = "ninja", marker = "sys_platform == 'linux'" }, + { name = "numba", marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "openai", marker = "sys_platform == 'linux'" }, + { name = "openai-harmony", marker = "sys_platform == 'linux'" }, + { name = "opencv-python-headless", marker = "sys_platform == 'linux'" }, + { name = "outlines-core", marker = "sys_platform == 'linux'" }, + { name = "partial-json-parser", marker = "sys_platform == 'linux'" }, + { name = "pillow", marker = "sys_platform == 'linux'" }, + { name = "prometheus-client", marker = "sys_platform == 'linux'" }, + { name = "prometheus-fastapi-instrumentator", marker = "sys_platform == 'linux'" }, + { name = "protobuf", marker = "sys_platform == 'linux'" }, + { name = "psutil", marker = "sys_platform == 'linux'" }, + { name = "py-cpuinfo", marker = "sys_platform == 'linux'" }, + { name = "pybase64", marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "python-json-logger", marker = "sys_platform == 'linux'" }, + { name = "pyyaml", marker = "sys_platform == 'linux'" }, + { name = "pyzmq", marker = "sys_platform == 'linux'" }, + { name = "ray", extra = ["cgraph"], marker = "sys_platform == 'linux'" }, + { name = "regex", marker = "sys_platform == 'linux'" }, + { name = "requests", marker = "sys_platform == 'linux'" }, + { name = "scipy", marker = "sys_platform == 'linux'" }, + { name = "sentencepiece", marker = "sys_platform == 'linux'" }, + { name = "setproctitle", marker = "sys_platform == 'linux'" }, + { name = "setuptools", marker = "sys_platform == 'linux'" }, + { name = "six", marker = "sys_platform == 'linux'" }, + { name = "tiktoken", marker = "sys_platform == 'linux'" }, + { name 
= "tokenizers", marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "torchaudio", marker = "sys_platform == 'linux'" }, + { name = "torchvision", marker = "sys_platform == 'linux'" }, + { name = "tqdm", marker = "sys_platform == 'linux'" }, + { name = "transformers", marker = "sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, + { name = "watchfiles", marker = "sys_platform == 'linux'" }, + { name = "xgrammar", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'ppc64le' and sys_platform == 'linux') or (platform_machine == 's390x' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/11/12/b922f96778d07df1c28dfa9a81fbc9706c13c5d0a4e8d154060818a79705/vllm-0.13.0.tar.gz", hash = "sha256:4ad43db45fef37114b550d03a4f423fb3fa3a31d8bc09ee810ef8b9cdcd4b5fe", size = 17828199, upload-time = "2025-12-19T03:30:32.741Z" } wheels = [ @@ -2789,17 +2782,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8d/96/04e7b441807b26b794da5b11e59ed7f83b2cf8af202bd7eba8ad2fa6046e/wadler_lindig-0.1.7-py3-none-any.whl", hash = "sha256:e3ec83835570fd0a9509f969162aeb9c65618f998b1f42918cfc8d45122fe953", size = 20516, upload-time = "2025-06-18T07:00:41.684Z" }, ] +[[package]] +name = "wandb" +version = "0.23.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "gitpython" }, + { name = "packaging" }, + { name = "platformdirs" }, + { name = "protobuf" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sentry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/0a/cc/770ae3aa7ae44f6792f7ecb81c14c0e38b672deb35235719bb1006519487/wandb-0.23.1.tar.gz", hash = "sha256:f6fb1e3717949b29675a69359de0eeb01e67d3360d581947d5b3f98c273567d6", size = 44298053, upload-time = "2025-12-03T02:25:10.79Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/0b/c3d7053dfd93fd259a63c7818d9c4ac2ba0642ff8dc8db98662ea0cf9cc0/wandb-0.23.1-py3-none-macosx_12_0_arm64.whl", hash = "sha256:358e15471d19b7d73fc464e37371c19d44d39e433252ac24df107aff993a286b", size = 21527293, upload-time = "2025-12-03T02:24:48.011Z" }, + { url = "https://files.pythonhosted.org/packages/ee/9f/059420fa0cb6c511dc5c5a50184122b6aca7b178cb2aa210139e354020da/wandb-0.23.1-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:110304407f4b38f163bdd50ed5c5225365e4df3092f13089c30171a75257b575", size = 22745926, upload-time = "2025-12-03T02:24:50.519Z" }, + { url = "https://files.pythonhosted.org/packages/96/b6/fd465827c14c64d056d30b4c9fcf4dac889a6969dba64489a88fc4ffa333/wandb-0.23.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:6cc984cf85feb2f8ee0451d76bc9fb7f39da94956bb8183e30d26284cf203b65", size = 21212973, upload-time = "2025-12-03T02:24:52.828Z" }, + { url = "https://files.pythonhosted.org/packages/5c/ee/9a8bb9a39cc1f09c3060456cc79565110226dc4099a719af5c63432da21d/wandb-0.23.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:67431cd3168d79fdb803e503bd669c577872ffd5dadfa86de733b3274b93088e", size = 22887885, upload-time = "2025-12-03T02:24:55.281Z" }, + { url = "https://files.pythonhosted.org/packages/6d/4d/8d9e75add529142e037b05819cb3ab1005679272950128d69d218b7e5b2e/wandb-0.23.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:07be70c0baa97ea25fadc4a9d0097f7371eef6dcacc5ceb525c82491a31e9244", size = 21250967, upload-time = "2025-12-03T02:24:57.603Z" }, + { url = "https://files.pythonhosted.org/packages/97/72/0b35cddc4e4168f03c759b96d9f671ad18aec8bdfdd84adfea7ecb3f5701/wandb-0.23.1-py3-none-musllinux_1_2_x86_64.whl", 
hash = "sha256:216c95b08e0a2ec6a6008373b056d597573d565e30b43a7a93c35a171485ee26", size = 22988382, upload-time = "2025-12-03T02:25:00.518Z" }, + { url = "https://files.pythonhosted.org/packages/c0/6d/e78093d49d68afb26f5261a70fc7877c34c114af5c2ee0ab3b1af85f5e76/wandb-0.23.1-py3-none-win32.whl", hash = "sha256:fb5cf0f85692f758a5c36ab65fea96a1284126de64e836610f92ddbb26df5ded", size = 22150756, upload-time = "2025-12-03T02:25:02.734Z" }, + { url = "https://files.pythonhosted.org/packages/05/27/4f13454b44c9eceaac3d6e4e4efa2230b6712d613ff9bf7df010eef4fd18/wandb-0.23.1-py3-none-win_amd64.whl", hash = "sha256:21c8c56e436eb707b7d54f705652e030d48e5cfcba24cf953823eb652e30e714", size = 22150760, upload-time = "2025-12-03T02:25:05.106Z" }, + { url = "https://files.pythonhosted.org/packages/30/20/6c091d451e2a07689bfbfaeb7592d488011420e721de170884fedd68c644/wandb-0.23.1-py3-none-win_arm64.whl", hash = "sha256:8aee7f3bb573f2c0acf860f497ca9c684f9b35f2ca51011ba65af3d4592b77c1", size = 20137463, upload-time = "2025-12-03T02:25:08.317Z" }, +] + [[package]] name = "watchfiles" version = "1.1.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "anyio" }, + { name = "anyio", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c2/c9/8869df9b2a2d6c59d79220a4db37679e74f807c559ffe5265e08b227a210/watchfiles-1.1.1.tar.gz", hash = "sha256:a173cb5c16c4f40ab19cecf48a534c409f7ea983ab8fed0741304a1c0a31b3f2", size = 94440, upload-time = "2025-10-14T15:06:21.08Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/74/d5/f039e7e3c639d9b1d09b07ea412a6806d38123f0508e5f9b48a87b0a76cc/watchfiles-1.1.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:8c89f9f2f740a6b7dcc753140dd5e1ab9215966f7a3530d0c0705c83b401bd7d", size = 404745, upload-time = "2025-10-14T15:04:46.731Z" }, - { url = 
"https://files.pythonhosted.org/packages/a5/96/a881a13aa1349827490dab2d363c8039527060cfcc2c92cc6d13d1b1049e/watchfiles-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bd404be08018c37350f0d6e34676bd1e2889990117a2b90070b3007f172d0610", size = 391769, upload-time = "2025-10-14T15:04:48.003Z" }, { url = "https://files.pythonhosted.org/packages/4b/5b/d3b460364aeb8da471c1989238ea0e56bec24b6042a68046adf3d9ddb01c/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8526e8f916bb5b9a0a777c8317c23ce65de259422bba5b31325a6fa6029d33af", size = 449374, upload-time = "2025-10-14T15:04:49.179Z" }, { url = "https://files.pythonhosted.org/packages/b9/44/5769cb62d4ed055cb17417c0a109a92f007114a4e07f30812a73a4efdb11/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2edc3553362b1c38d9f06242416a5d8e9fe235c204a4072e988ce2e5bb1f69f6", size = 459485, upload-time = "2025-10-14T15:04:50.155Z" }, { url = "https://files.pythonhosted.org/packages/19/0c/286b6301ded2eccd4ffd0041a1b726afda999926cf720aab63adb68a1e36/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30f7da3fb3f2844259cba4720c3fc7138eb0f7b659c38f3bfa65084c7fc7abce", size = 488813, upload-time = "2025-10-14T15:04:51.059Z" }, @@ -2808,9 +2828,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/68/5707da262a119fb06fbe214d82dd1fe4a6f4af32d2d14de368d0349eb52a/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db5d7ae38ff20153d542460752ff397fcf5c96090c1230803713cf3147a6803", size = 456812, upload-time = "2025-10-14T15:04:55.174Z" }, { url = "https://files.pythonhosted.org/packages/66/ab/3cbb8756323e8f9b6f9acb9ef4ec26d42b2109bce830cc1f3468df20511d/watchfiles-1.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:28475ddbde92df1874b6c5c8aaeb24ad5be47a11f87cde5a28ef3835932e3e94", size = 630196, upload-time = "2025-10-14T15:04:56.22Z" }, { url = 
"https://files.pythonhosted.org/packages/78/46/7152ec29b8335f80167928944a94955015a345440f524d2dfe63fc2f437b/watchfiles-1.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:36193ed342f5b9842edd3532729a2ad55c4160ffcfa3700e0d54be496b70dd43", size = 622657, upload-time = "2025-10-14T15:04:57.521Z" }, - { url = "https://files.pythonhosted.org/packages/0a/bf/95895e78dd75efe9a7f31733607f384b42eb5feb54bd2eb6ed57cc2e94f4/watchfiles-1.1.1-cp312-cp312-win32.whl", hash = "sha256:859e43a1951717cc8de7f4c77674a6d389b106361585951d9e69572823f311d9", size = 272042, upload-time = "2025-10-14T15:04:59.046Z" }, - { url = "https://files.pythonhosted.org/packages/87/0a/90eb755f568de2688cb220171c4191df932232c20946966c27a59c400850/watchfiles-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:91d4c9a823a8c987cce8fa2690923b069966dabb196dd8d137ea2cede885fde9", size = 288410, upload-time = "2025-10-14T15:05:00.081Z" }, - { url = "https://files.pythonhosted.org/packages/36/76/f322701530586922fbd6723c4f91ace21364924822a8772c549483abed13/watchfiles-1.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:a625815d4a2bdca61953dbba5a39d60164451ef34c88d751f6c368c3ea73d404", size = 278209, upload-time = "2025-10-14T15:05:01.168Z" }, ] [[package]] @@ -2819,50 +2836,32 @@ version = "15.0.1" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload-time = "2025-03-05T20:02:16.706Z" }, - { url = 
"https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload-time = "2025-03-05T20:02:18.832Z" }, - { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload-time = "2025-03-05T20:02:20.187Z" }, { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload-time = "2025-03-05T20:02:22.286Z" }, { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload-time = "2025-03-05T20:02:24.368Z" }, { url = "https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload-time = "2025-03-05T20:02:25.669Z" }, { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload-time = 
"2025-03-05T20:02:26.99Z" }, { url = "https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload-time = "2025-03-05T20:02:30.291Z" }, { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", size = 182160, upload-time = "2025-03-05T20:02:31.634Z" }, - { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload-time = "2025-03-05T20:02:33.017Z" }, - { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload-time = "2025-03-05T20:02:34.498Z" }, { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, ] -[[package]] -name = "win32-setctime" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" }, -] - [[package]] name = "xgrammar" version = "0.1.27" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "mlx-lm", marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" }, - { name = "ninja" }, - { name = "numpy" }, - { name = "pydantic" }, - { name = "torch" }, - { name = "transformers" }, + { name = "ninja", marker = "sys_platform == 'linux'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux'" }, + { name = "pydantic", marker = "sys_platform == 'linux'" }, + { name = "torch", version = "2.9.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "sys_platform == 'linux'" }, + { name = "transformers", marker = "sys_platform == 'linux'" }, { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/62/e1/b522b1e50fddd773d368c2945ef5ed628aa90c0c972027f9aa5a51d6d4f9/xgrammar-0.1.27.tar.gz", hash = "sha256:40af7bb2891f1633ec7f660723c74a92a963307d283aca9e3b4e53a0feaf1d46", size = 2303435, upload-time = "2025-11-04T03:11:53.512Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/39/b6/09b43e2adff45d30ebcf9110d0ff753f4c96b368adaa2d166df3dee88d5f/xgrammar-0.1.27-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:6404a7714440eb86ab0379d749f33591274eeef04787dc00d61f22069f3ed51d", size = 663319, upload-time = "2025-11-04T03:11:28.682Z" }, - { url = 
"https://files.pythonhosted.org/packages/88/8b/53eb5c6d0df8df9f6350f182516a5b8c7b8b11d62650300d2c04af2bc4ea/xgrammar-0.1.27-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d01fa9894bc44a7f6a70b0301b59f3e310c0e0e7b7ea4cf5ce190b12d8220dd8", size = 636168, upload-time = "2025-11-04T03:11:30.373Z" }, { url = "https://files.pythonhosted.org/packages/08/1b/53d30395bb973f13255d3e3a72961f95fdfb4083877c3f93bb626e3d1522/xgrammar-0.1.27-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:906c0601bac9170e1bab77ca985259035ff9c386c347efcb191555eab86e984e", size = 8676340, upload-time = "2025-11-04T03:11:32.203Z" }, { url = "https://files.pythonhosted.org/packages/48/74/70cfac0171d9f309cfe18c5384330e3edc9466c436b258495fd30ecf29a3/xgrammar-0.1.27-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb68988a122f544301c496f2cac8ee82960ca7f5b3a42a952b2a00c0a55e6ca5", size = 8870650, upload-time = "2025-11-04T03:11:34.322Z" }, - { url = "https://files.pythonhosted.org/packages/6a/a1/0392aa9c7669c56f7f88e4423b246476a74a72c3bb9db944e1bfc029985e/xgrammar-0.1.27-cp312-cp312-win_amd64.whl", hash = "sha256:3aac335ea052afc8f8dc34b9f2afcb9462a68189423aed9f60b0941db6cfc310", size = 708811, upload-time = "2025-11-04T03:11:36.214Z" }, ] [[package]]