From 790dcde04f51a7cb49c1668f0921ae8728e4b8d0 Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Fri, 23 Jan 2026 06:56:49 -0800 Subject: [PATCH 1/8] feat: make evolver zero-init and reconstruction warmup configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add `zero_init` flag to EvolverParams to control whether evolver starts as identity function (z_{t+1} = z_t). this provides training stability but may slow dynamics learning. reconstruction_warmup_epochs was already configurable in TrainingConfig and freezes evolver while training encoder/decoder on reconstruction loss. both features can now be easily toggled via config or cli overrides: - --evolver_params.zero_init false - --training.reconstruction_warmup_epochs 10 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/LatentEvolution/eed_model.py | 18 +++++++++++------- src/LatentEvolution/latent_1step.yaml | 1 + src/LatentEvolution/latent_20step.yaml | 1 + src/LatentEvolution/latent_5step.yaml | 1 + 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/LatentEvolution/eed_model.py b/src/LatentEvolution/eed_model.py index 94b02e10..7aa0dde9 100644 --- a/src/LatentEvolution/eed_model.py +++ b/src/LatentEvolution/eed_model.py @@ -51,6 +51,9 @@ class EvolverParams(BaseModel): time_units: int = Field( 1, description="DEPRECATED: Use training.time_units instead. Kept for backwards compatibility." ) + zero_init: bool = Field( + True, description="If True, zero-initialize final layer so evolver starts as identity (z_{t+1} = z_t). Provides stability but may slow dynamics learning." + ) model_config = ConfigDict(extra="forbid", validate_assignment=True) @field_validator("activation") @@ -215,13 +218,14 @@ def __init__(self, latent_dims: int, stim_dims: int, evolver_params: EvolverPara ) ) - # zero-init final layer so evolver starts as identity (z_{t+1} = z_t) - if evolver_params.use_input_skips: - nn.init.zeros_(self.evolver.output_layer.weight) - nn.init.zeros_(self.evolver.output_layer.bias) - else: - nn.init.zeros_(self.evolver.layers[-1].weight) - nn.init.zeros_(self.evolver.layers[-1].bias) + # optionally zero-init final layer so evolver starts as identity (z_{t+1} = z_t) + if evolver_params.zero_init: + if evolver_params.use_input_skips: + nn.init.zeros_(self.evolver.output_layer.weight) + nn.init.zeros_(self.evolver.output_layer.bias) + else: + nn.init.zeros_(self.evolver.layers[-1].weight) + nn.init.zeros_(self.evolver.layers[-1].bias) def forward(self, proj_t, proj_stim_t): """Evolve one time step in latent space.""" diff --git a/src/LatentEvolution/latent_1step.yaml b/src/LatentEvolution/latent_1step.yaml index c8bcdf14..e345b52e 100644 --- a/src/LatentEvolution/latent_1step.yaml +++ b/src/LatentEvolution/latent_1step.yaml @@ -25,6 +25,7 @@ evolver_params: l1_reg_loss: 0.0 activation: ReLU use_input_skips: true # Use MLPWithSkips (input fed to each hidden layer via concatenation) + zero_init: true # Zero-initialize final layer so evolver starts as identity (z_{t+1} = z_t) stimulus_encoder_params: num_input_dims: 1736 diff --git a/src/LatentEvolution/latent_20step.yaml b/src/LatentEvolution/latent_20step.yaml index 3012ce63..ead0fd44 100644 --- a/src/LatentEvolution/latent_20step.yaml +++ b/src/LatentEvolution/latent_20step.yaml @@ -25,6 +25,7 @@ evolver_params: l1_reg_loss: 0.0 activation: Tanh use_input_skips: true # Use MLPWithSkips (input fed to each hidden layer via concatenation) + zero_init: true # Zero-initialize final layer so 
evolver starts as identity (z_{t+1} = z_t) stimulus_encoder_params: num_input_dims: 1736 diff --git a/src/LatentEvolution/latent_5step.yaml b/src/LatentEvolution/latent_5step.yaml index 4da34b2b..16055445 100644 --- a/src/LatentEvolution/latent_5step.yaml +++ b/src/LatentEvolution/latent_5step.yaml @@ -25,6 +25,7 @@ evolver_params: l1_reg_loss: 0.0 activation: Tanh use_input_skips: true # Use MLPWithSkips (input fed to each hidden layer via concatenation) + zero_init: true # Zero-initialize final layer so evolver starts as identity (z_{t+1} = z_t) stimulus_encoder_params: num_input_dims: 1736 From d77e2d45ee16eec4799fc1233d0bd077b20bce0b Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Fri, 23 Jan 2026 07:02:19 -0800 Subject: [PATCH 2/8] feat: add total variation (tv) norm regularization to evolver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit implement option b tv norm regularization: directly penalize the magnitude of evolver updates (Δz) using l1 norm. this stabilizes dynamics and prevents explosive rollouts during long-horizon evolution. changes: - add `tv_reg_loss` parameter to EvolverParams (default: 0.0) - compute tv loss as ||Δz||₁ at each evolver step - add TV_LOSS to LossType enum and logging - conditional computation: only compute delta_z explicitly when tv_reg_loss > 0 - update all config files with tv_reg_loss (default 0.0, typical: 1e-5 to 1e-3) implementation: - when tv_reg_loss > 0: explicitly compute delta_z = evolver(z_t, stim) then accumulate tv_loss += ||delta_z||₁ * coeff before updating z_{t+1} = z_t + delta_z - when tv_reg_loss = 0: use original path for efficiency typical usage: python latent.py exp latent_20step.yaml --evolver_params.tv_reg_loss 0.0001 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/LatentEvolution/eed_model.py | 3 ++ src/LatentEvolution/latent.py | 71 +++++++++++++++++++------- src/LatentEvolution/latent_1step.yaml | 1 + src/LatentEvolution/latent_20step.yaml | 1 + src/LatentEvolution/latent_5step.yaml | 1 + 5 files changed, 59 insertions(+), 18 deletions(-) diff --git a/src/LatentEvolution/eed_model.py b/src/LatentEvolution/eed_model.py index 7aa0dde9..57fbb0cd 100644 --- a/src/LatentEvolution/eed_model.py +++ b/src/LatentEvolution/eed_model.py @@ -54,6 +54,9 @@ class EvolverParams(BaseModel): zero_init: bool = Field( True, description="If True, zero-initialize final layer so evolver starts as identity (z_{t+1} = z_t). Provides stability but may slow dynamics learning." ) + tv_reg_loss: float = Field( + 0.0, description="total variation regularization on evolver updates. penalizes ||Δz|| to stabilize dynamics and prevent explosive rollouts. typical range: 1e-5 to 1e-3." 
+ ) model_config = ConfigDict(extra="forbid", validate_assignment=True) @field_validator("activation") diff --git a/src/LatentEvolution/latent.py b/src/LatentEvolution/latent.py index bb6bd6b9..078d1984 100644 --- a/src/LatentEvolution/latent.py +++ b/src/LatentEvolution/latent.py @@ -178,6 +178,7 @@ class LossType(Enum): EVOLVE = auto() REG = auto() AUG_LOSS = auto() + TV_LOSS = auto() # ------------------------------------------------------------------- @@ -334,8 +335,9 @@ def train_step_reconstruction_only_nocompile( # return same tuple format for compatibility evolve_loss = torch.tensor(0.0, device=device) aug_loss = torch.tensor(0.0, device=device) + tv_loss = torch.tensor(0.0, device=device) loss = recon_loss + reg_loss - return (loss, recon_loss, evolve_loss, reg_loss, aug_loss) + return (loss, recon_loss, evolve_loss, reg_loss, aug_loss, tv_loss) train_step_reconstruction_only = torch.compile( @@ -370,6 +372,9 @@ def train_step_nocompile( for p in model.evolver.parameters(): reg_loss += torch.abs(p).mean()*cfg.evolver_params.l1_reg_loss + # total variation regularization on evolver updates + tv_loss = torch.tensor(0.0, device=device) + # b = batch size # N = neurons @@ -404,7 +409,13 @@ def train_step_nocompile( # Evolve by 1 time step. This is a special case since we may opt to apply # a connectome constraint via data augmentation. - proj_t = model.evolver(proj_t, proj_stim_t[0]) + if cfg.evolver_params.tv_reg_loss > 0.: + # compute delta_z explicitly for TV norm + delta_z = model.evolver.evolver(torch.cat([proj_t, proj_stim_t[0]], dim=1)) + tv_loss += torch.abs(delta_z).mean() * cfg.evolver_params.tv_reg_loss + proj_t = proj_t + delta_z + else: + proj_t = model.evolver(proj_t, proj_stim_t[0]) # apply connectome loss after evolving by 1 time step aug_loss = torch.tensor(0.0, device=device) if ( @@ -422,8 +433,14 @@ def train_step_nocompile( # evolve for remaining dt-1 time steps (first window) evolve_loss = torch.tensor(0.0, device=device) - for i in range(1, dt): - proj_t = model.evolver(proj_t, proj_stim_t[i]) + if cfg.evolver_params.tv_reg_loss > 0.: + for i in range(1, dt): + delta_z = model.evolver.evolver(torch.cat([proj_t, proj_stim_t[i]], dim=1)) + tv_loss += torch.abs(delta_z).mean() * cfg.evolver_params.tv_reg_loss + proj_t = proj_t + delta_z + else: + for i in range(1, dt): + proj_t = model.evolver(proj_t, proj_stim_t[i]) # loss at first multiple (dt) pred_t_plus_dt = model.decoder(proj_t) @@ -433,20 +450,35 @@ def train_step_nocompile( evolve_loss = evolve_loss + loss_fn(pred_t_plus_dt, x_t_plus_dt) # additional multiples (2, 3, ..., num_multiples) - for m in range(2, num_multiples + 1): - # evolve dt more steps - start_idx = (m - 1) * dt - for i in range(dt): - proj_t = model.evolver(proj_t, proj_stim_t[start_idx + i]) - # loss at this multiple - pred = model.decoder(proj_t) - # target at t + m*dt - target_indices_m = observation_indices + m * dt - x_target = train_data[target_indices_m, neuron_indices] # (b, N) - evolve_loss = evolve_loss + loss_fn(pred, x_target) - - loss = evolve_loss + recon_loss + reg_loss + aug_loss - return (loss, recon_loss, evolve_loss, reg_loss, aug_loss) + if cfg.evolver_params.tv_reg_loss > 0.: + for m in range(2, num_multiples + 1): + # evolve dt more steps + start_idx = (m - 1) * dt + for i in range(dt): + delta_z = model.evolver.evolver(torch.cat([proj_t, proj_stim_t[start_idx + i]], dim=1)) + tv_loss += torch.abs(delta_z).mean() * cfg.evolver_params.tv_reg_loss + proj_t = proj_t + delta_z + # loss at this multiple + pred = 
model.decoder(proj_t) + # target at t + m*dt + target_indices_m = observation_indices + m * dt + x_target = train_data[target_indices_m, neuron_indices] # (b, N) + evolve_loss = evolve_loss + loss_fn(pred, x_target) + else: + for m in range(2, num_multiples + 1): + # evolve dt more steps + start_idx = (m - 1) * dt + for i in range(dt): + proj_t = model.evolver(proj_t, proj_stim_t[start_idx + i]) + # loss at this multiple + pred = model.decoder(proj_t) + # target at t + m*dt + target_indices_m = observation_indices + m * dt + x_target = train_data[target_indices_m, neuron_indices] # (b, N) + evolve_loss = evolve_loss + loss_fn(pred, x_target) + + loss = evolve_loss + recon_loss + reg_loss + aug_loss + tv_loss + return (loss, recon_loss, evolve_loss, reg_loss, aug_loss, tv_loss) train_step = torch.compile(train_step_nocompile, fullgraph=True, mode="reduce-overhead") @@ -618,6 +650,7 @@ def train(cfg: ModelParams, run_dir: Path): LossType.EVOLVE: loss_tuple[2], LossType.REG: loss_tuple[3], LossType.AUG_LOSS: loss_tuple[4], + LossType.TV_LOSS: loss_tuple[5], }) warmup_epoch_duration = (datetime.now() - warmup_epoch_start).total_seconds() @@ -759,6 +792,7 @@ def train(cfg: ModelParams, run_dir: Path): LossType.EVOLVE: loss_tuple[2], LossType.REG: loss_tuple[3], LossType.AUG_LOSS: loss_tuple[4], + LossType.TV_LOSS: loss_tuple[5], }) # sample timing every 10 batches @@ -785,6 +819,7 @@ def train(cfg: ModelParams, run_dir: Path): writer.add_scalar("Loss/train_evolve", mean_losses[LossType.EVOLVE], epoch) writer.add_scalar("Loss/train_reg", mean_losses[LossType.REG], epoch) writer.add_scalar("Loss/train_aug_loss", mean_losses[LossType.AUG_LOSS], epoch) + writer.add_scalar("Loss/train_tv_loss", mean_losses[LossType.TV_LOSS], epoch) writer.add_scalar("Time/epoch_duration", epoch_duration, epoch) writer.add_scalar("Time/total_elapsed", total_elapsed, epoch) diff --git a/src/LatentEvolution/latent_1step.yaml b/src/LatentEvolution/latent_1step.yaml index e345b52e..de588829 100644 --- a/src/LatentEvolution/latent_1step.yaml +++ b/src/LatentEvolution/latent_1step.yaml @@ -26,6 +26,7 @@ evolver_params: activation: ReLU use_input_skips: true # Use MLPWithSkips (input fed to each hidden layer via concatenation) zero_init: true # Zero-initialize final layer so evolver starts as identity (z_{t+1} = z_t) + tv_reg_loss: 0.0 # Total variation regularization on evolver updates (typical: 1e-5 to 1e-3) stimulus_encoder_params: num_input_dims: 1736 diff --git a/src/LatentEvolution/latent_20step.yaml b/src/LatentEvolution/latent_20step.yaml index ead0fd44..5abe817d 100644 --- a/src/LatentEvolution/latent_20step.yaml +++ b/src/LatentEvolution/latent_20step.yaml @@ -26,6 +26,7 @@ evolver_params: activation: Tanh use_input_skips: true # Use MLPWithSkips (input fed to each hidden layer via concatenation) zero_init: true # Zero-initialize final layer so evolver starts as identity (z_{t+1} = z_t) + tv_reg_loss: 0.0 # Total variation regularization on evolver updates (typical: 1e-5 to 1e-3) stimulus_encoder_params: num_input_dims: 1736 diff --git a/src/LatentEvolution/latent_5step.yaml b/src/LatentEvolution/latent_5step.yaml index 16055445..ff9d24a4 100644 --- a/src/LatentEvolution/latent_5step.yaml +++ b/src/LatentEvolution/latent_5step.yaml @@ -26,6 +26,7 @@ evolver_params: activation: Tanh use_input_skips: true # Use MLPWithSkips (input fed to each hidden layer via concatenation) zero_init: true # Zero-initialize final layer so evolver starts as identity (z_{t+1} = z_t) + tv_reg_loss: 0.0 # Total variation regularization 
on evolver updates (typical: 1e-5 to 1e-3) stimulus_encoder_params: num_input_dims: 1736 From d2e7ac593b63a898016724b37bcf02a63d237c3e Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Fri, 23 Jan 2026 07:11:40 -0800 Subject: [PATCH 3/8] test: benchmark torch.compile with different return types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit add benchmark comparing tuple, namedtuple, and dict return types with torch.compile to determine best approach for loss returns. results (cpu, reduce-overhead mode): - tuple: 236.69 ± 24.02 µs/iter (baseline) - namedtuple: 245.24 ± 21.82 µs/iter (+3.6% overhead) - dict (enum keys): 244.83 ± 19.62 µs/iter (+3.4% overhead) - dict (str keys): 322.72 ± 157.59 µs/iter (+36.4%, high variance) conclusion: namedtuple has negligible overhead (<4%) and provides semantic access, type safety, and flexibility to omit unused fields. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- test_compile_returns.py | 174 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 test_compile_returns.py diff --git a/test_compile_returns.py b/test_compile_returns.py new file mode 100644 index 00000000..9f8976bc --- /dev/null +++ b/test_compile_returns.py @@ -0,0 +1,174 @@ +"""Test torch.compile compatibility with different return types.""" +import torch +import time +import numpy as np +from enum import Enum, auto +from typing import NamedTuple, Dict + + +class LossType(Enum): + """Loss component types.""" + TOTAL = auto() + RECON = auto() + EVOLVE = auto() + + +# NamedTuple version +class LossDict(NamedTuple): + total: torch.Tensor + recon: torch.Tensor + evolve: torch.Tensor + + +# Test functions +def train_step_tuple(x: torch.Tensor) -> tuple: + """Return regular tuple.""" + loss1 = x.mean() + loss2 = x.std() + loss3 = loss1 + loss2 + return (loss3, loss1, loss2) + + +def train_step_namedtuple(x: torch.Tensor) -> LossDict: + """Return NamedTuple.""" + loss1 = x.mean() + loss2 = x.std() + loss3 = loss1 + loss2 + return LossDict(total=loss3, recon=loss1, evolve=loss2) + + +def train_step_dict_literal(x: torch.Tensor) -> dict: + """Return dict with literal {} syntax.""" + loss1 = x.mean() + loss2 = x.std() + loss3 = loss1 + loss2 + return {"total": loss3, "recon": loss1, "evolve": loss2} + + +def train_step_dict_enum_keys(x: torch.Tensor) -> Dict[LossType, torch.Tensor]: + """Return dict with enum keys.""" + loss1 = x.mean() + loss2 = x.std() + loss3 = loss1 + loss2 + return {LossType.TOTAL: loss3, LossType.RECON: loss1, LossType.EVOLVE: loss2} + + +def benchmark(fn, x, name, num_iters=1000, num_trials=10): + """Benchmark a function with error bars.""" + # Warmup + for _ in range(10): + result = fn(x) + + # Multiple trials + trial_times = [] + for _ in range(num_trials): + torch.cuda.synchronize() if x.is_cuda else None + start = time.time() + for _ in range(num_iters): + result = fn(x) + torch.cuda.synchronize() if x.is_cuda else None + elapsed = time.time() - start + trial_times.append(elapsed) + + # Statistics + mean_time = np.mean(trial_times) + std_time = np.std(trial_times) + mean_us = mean_time * 1000000 / num_iters + std_us = std_time * 1000000 / num_iters + + print(f"{name:30s}: {mean_us:.2f} ± {std_us:.2f} µs/iter") + return result, mean_us, std_us + + +if __name__ == "__main__": + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Testing on device: {device}\n") + + x = torch.randn(1000, 1000, device=device) + + print("=" * 
70) + print("COMPILATION TEST (checking if each compiles without error)") + print("=" * 70) + + # Test 1: Regular tuple + try: + compiled_tuple = torch.compile(train_step_tuple, fullgraph=True, mode="reduce-overhead") + result = compiled_tuple(x) + print(f"✓ Regular tuple: SUCCESS - returns {type(result)}") + print(f" Values: {[f'{v.item():.4f}' for v in result]}") + except Exception as e: + print(f"✗ Regular tuple: FAILED - {e}") + + # Test 2: NamedTuple + try: + compiled_namedtuple = torch.compile(train_step_namedtuple, fullgraph=True, mode="reduce-overhead") + result = compiled_namedtuple(x) + print(f"✓ NamedTuple: SUCCESS - returns {type(result)}") + print(f" Values: total={result.total.item():.4f}, recon={result.recon.item():.4f}, evolve={result.evolve.item():.4f}") + print(f" Can access by name: result.total = {result.total.item():.4f}") + except Exception as e: + print(f"✗ NamedTuple: FAILED - {e}") + + # Test 3: Dict with string keys (literal {}) + try: + compiled_dict_literal = torch.compile(train_step_dict_literal, fullgraph=True, mode="reduce-overhead") + result = compiled_dict_literal(x) + print(f"✓ Dict (string keys): SUCCESS - returns {type(result)}") + print(f" Values: {[(k, f'{v.item():.4f}') for k, v in result.items()]}") + except Exception as e: + print(f"✗ Dict (string keys): FAILED - {e}") + + # Test 4: Dict with enum keys + try: + compiled_dict_enum = torch.compile(train_step_dict_enum_keys, fullgraph=True, mode="reduce-overhead") + result = compiled_dict_enum(x) + print(f"✓ Dict (enum keys): SUCCESS - returns {type(result)}") + print(f" Values: {[(k, f'{v.item():.4f}') for k, v in result.items()]}") + except Exception as e: + print(f"✗ Dict (enum keys): FAILED - {e}") + + print("\n" + "=" * 70) + print("PERFORMANCE BENCHMARK (mean ± std over 10 trials)") + print("=" * 70) + + # Benchmark each version + results = {} + + print("\nUncompiled:") + _, results['tuple_uncompiled'], _ = benchmark(train_step_tuple, x, " Tuple") + _, results['namedtuple_uncompiled'], _ = benchmark(train_step_namedtuple, x, " NamedTuple") + _, results['dict_str_uncompiled'], _ = benchmark(train_step_dict_literal, x, " Dict (string keys)") + _, results['dict_enum_uncompiled'], _ = benchmark(train_step_dict_enum_keys, x, " Dict (enum keys)") + + print("\nCompiled (reduce-overhead):") + _, results['tuple_compiled'], _ = benchmark(compiled_tuple, x, " Tuple") + _, results['namedtuple_compiled'], _ = benchmark(compiled_namedtuple, x, " NamedTuple") + _, results['dict_str_compiled'], _ = benchmark(compiled_dict_literal, x, " Dict (string keys)") + _, results['dict_enum_compiled'], _ = benchmark(compiled_dict_enum, x, " Dict (enum keys)") + + print("\n" + "=" * 70) + print("SPEEDUP vs TUPLE (compiled)") + print("=" * 70) + baseline = results['tuple_compiled'] + print(" Tuple: 1.00x (baseline)") + print(f" NamedTuple: {baseline/results['namedtuple_compiled']:.2f}x") + print(f" Dict (string keys): {baseline/results['dict_str_compiled']:.2f}x") + print(f" Dict (enum keys): {baseline/results['dict_enum_compiled']:.2f}x") + + print("\n" + "=" * 70) + print("OVERHEAD vs TUPLE (compiled)") + print("=" * 70) + print(f" NamedTuple: {(results['namedtuple_compiled']/baseline - 1)*100:+.1f}%") + print(f" Dict (string keys): {(results['dict_str_compiled']/baseline - 1)*100:+.1f}%") + print(f" Dict (enum keys): {(results['dict_enum_compiled']/baseline - 1)*100:+.1f}%") + + print("\n" + "=" * 70) + print("RECOMMENDATION") + print("=" * 70) + overhead_namedtuple = (results['namedtuple_compiled']/baseline - 1)*100 + if 
overhead_namedtuple < 20: + print(f"✓ NamedTuple has only {overhead_namedtuple:.1f}% overhead → RECOMMENDED") + print(" Benefits: semantic access (result.recon), type safety, immutable") + else: + print(f"✗ NamedTuple has {overhead_namedtuple:.1f}% overhead → Use regular tuple") + print("\nFor this codebase: NamedTuple with field names matching LossType enum") From d79e77ae903947717e5f2ac38e0fbdeb78110d8b Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Fri, 23 Jan 2026 07:16:18 -0800 Subject: [PATCH 4/8] test: improve benchmark with realistic computation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix benchmark to use realistic training step computation instead of trivial mean/std operations. previous version was too small and showed compiled code being slower than uncompiled (nonsensical). changes: - simulate encoder/decoder with matrix multiplies and relu - add multiple loss computations (recon, l1 reg, temporal smoothness) - use batch_size=256, neurons=1000, latent=256 (realistic sizes) - ensure proper cuda synchronization - add compilation speedup metrics this should show proper speedup from torch.compile and accurate overhead comparison between tuple, namedtuple, and dict returns. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- test_compile_returns.py | 114 ++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 39 deletions(-) diff --git a/test_compile_returns.py b/test_compile_returns.py index 9f8976bc..a3f961ab 100644 --- a/test_compile_returns.py +++ b/test_compile_returns.py @@ -20,53 +20,73 @@ class LossDict(NamedTuple): evolve: torch.Tensor -# Test functions -def train_step_tuple(x: torch.Tensor) -> tuple: +# Test functions - simulate realistic training step +def train_step_tuple(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> tuple: """Return regular tuple.""" - loss1 = x.mean() - loss2 = x.std() - loss3 = loss1 + loss2 - return (loss3, loss1, loss2) + # Simulate encoder/decoder operations + h1 = torch.relu(x @ w1) # hidden layer + out = h1 @ w2 # output + # Multiple loss computations + loss1 = ((out - x) ** 2).mean() # reconstruction + loss2 = torch.abs(h1).mean() # l1 reg + loss3 = ((out[1:] - out[:-1]) ** 2).mean() # temporal smoothness + total = loss1 + 0.1 * loss2 + 0.01 * loss3 + return (total, loss1, loss2, loss3) -def train_step_namedtuple(x: torch.Tensor) -> LossDict: + +def train_step_namedtuple(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> LossDict: """Return NamedTuple.""" - loss1 = x.mean() - loss2 = x.std() - loss3 = loss1 + loss2 - return LossDict(total=loss3, recon=loss1, evolve=loss2) + h1 = torch.relu(x @ w1) + out = h1 @ w2 + + loss1 = ((out - x) ** 2).mean() + loss2 = torch.abs(h1).mean() + loss3 = ((out[1:] - out[:-1]) ** 2).mean() + total = loss1 + 0.1 * loss2 + 0.01 * loss3 + return LossDict(total=total, recon=loss1, evolve=loss3) -def train_step_dict_literal(x: torch.Tensor) -> dict: +def train_step_dict_literal(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> dict: """Return dict with literal {} syntax.""" - loss1 = x.mean() - loss2 = x.std() - loss3 = loss1 + loss2 - return {"total": loss3, "recon": loss1, "evolve": loss2} + h1 = torch.relu(x @ w1) + out = h1 @ w2 + loss1 = ((out - x) ** 2).mean() + loss2 = torch.abs(h1).mean() + loss3 = ((out[1:] - out[:-1]) ** 2).mean() + total = loss1 + 0.1 * loss2 + 0.01 * loss3 + return {"total": total, "recon": loss1, "evolve": loss3} -def train_step_dict_enum_keys(x: torch.Tensor) 
-> Dict[LossType, torch.Tensor]: + +def train_step_dict_enum_keys(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> Dict[LossType, torch.Tensor]: """Return dict with enum keys.""" - loss1 = x.mean() - loss2 = x.std() - loss3 = loss1 + loss2 - return {LossType.TOTAL: loss3, LossType.RECON: loss1, LossType.EVOLVE: loss2} + h1 = torch.relu(x @ w1) + out = h1 @ w2 + + loss1 = ((out - x) ** 2).mean() + loss2 = torch.abs(h1).mean() + loss3 = ((out[1:] - out[:-1]) ** 2).mean() + total = loss1 + 0.1 * loss2 + 0.01 * loss3 + return {LossType.TOTAL: total, LossType.RECON: loss1, LossType.EVOLVE: loss3} -def benchmark(fn, x, name, num_iters=1000, num_trials=10): +def benchmark(fn, args, name, num_iters=1000, num_trials=10): """Benchmark a function with error bars.""" # Warmup for _ in range(10): - result = fn(x) + result = fn(*args) # Multiple trials trial_times = [] for _ in range(num_trials): - torch.cuda.synchronize() if x.is_cuda else None + if args[0].is_cuda: + torch.cuda.synchronize() start = time.time() for _ in range(num_iters): - result = fn(x) - torch.cuda.synchronize() if x.is_cuda else None + result = fn(*args) + if args[0].is_cuda: + torch.cuda.synchronize() elapsed = time.time() - start trial_times.append(elapsed) @@ -84,7 +104,15 @@ def benchmark(fn, x, name, num_iters=1000, num_trials=10): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Testing on device: {device}\n") - x = torch.randn(1000, 1000, device=device) + # Create test data - simulate batch of neural activity + batch_size = 256 + num_neurons = 1000 + latent_dim = 256 + + x = torch.randn(batch_size, num_neurons, device=device) + w1 = torch.randn(num_neurons, latent_dim, device=device) / (num_neurons ** 0.5) + w2 = torch.randn(latent_dim, num_neurons, device=device) / (latent_dim ** 0.5) + args = (x, w1, w2) print("=" * 70) print("COMPILATION TEST (checking if each compiles without error)") @@ -93,7 +121,7 @@ def benchmark(fn, x, name, num_iters=1000, num_trials=10): # Test 1: Regular tuple try: compiled_tuple = torch.compile(train_step_tuple, fullgraph=True, mode="reduce-overhead") - result = compiled_tuple(x) + result = compiled_tuple(*args) print(f"✓ Regular tuple: SUCCESS - returns {type(result)}") print(f" Values: {[f'{v.item():.4f}' for v in result]}") except Exception as e: @@ -102,7 +130,7 @@ def benchmark(fn, x, name, num_iters=1000, num_trials=10): # Test 2: NamedTuple try: compiled_namedtuple = torch.compile(train_step_namedtuple, fullgraph=True, mode="reduce-overhead") - result = compiled_namedtuple(x) + result = compiled_namedtuple(*args) print(f"✓ NamedTuple: SUCCESS - returns {type(result)}") print(f" Values: total={result.total.item():.4f}, recon={result.recon.item():.4f}, evolve={result.evolve.item():.4f}") print(f" Can access by name: result.total = {result.total.item():.4f}") @@ -112,7 +140,7 @@ def benchmark(fn, x, name, num_iters=1000, num_trials=10): # Test 3: Dict with string keys (literal {}) try: compiled_dict_literal = torch.compile(train_step_dict_literal, fullgraph=True, mode="reduce-overhead") - result = compiled_dict_literal(x) + result = compiled_dict_literal(*args) print(f"✓ Dict (string keys): SUCCESS - returns {type(result)}") print(f" Values: {[(k, f'{v.item():.4f}') for k, v in result.items()]}") except Exception as e: @@ -121,7 +149,7 @@ def benchmark(fn, x, name, num_iters=1000, num_trials=10): # Test 4: Dict with enum keys try: compiled_dict_enum = torch.compile(train_step_dict_enum_keys, fullgraph=True, mode="reduce-overhead") - result = 
compiled_dict_enum(x) + result = compiled_dict_enum(*args) print(f"✓ Dict (enum keys): SUCCESS - returns {type(result)}") print(f" Values: {[(k, f'{v.item():.4f}') for k, v in result.items()]}") except Exception as e: @@ -135,16 +163,24 @@ def benchmark(fn, x, name, num_iters=1000, num_trials=10): results = {} print("\nUncompiled:") - _, results['tuple_uncompiled'], _ = benchmark(train_step_tuple, x, " Tuple") - _, results['namedtuple_uncompiled'], _ = benchmark(train_step_namedtuple, x, " NamedTuple") - _, results['dict_str_uncompiled'], _ = benchmark(train_step_dict_literal, x, " Dict (string keys)") - _, results['dict_enum_uncompiled'], _ = benchmark(train_step_dict_enum_keys, x, " Dict (enum keys)") + _, results['tuple_uncompiled'], _ = benchmark(train_step_tuple, args, " Tuple") + _, results['namedtuple_uncompiled'], _ = benchmark(train_step_namedtuple, args, " NamedTuple") + _, results['dict_str_uncompiled'], _ = benchmark(train_step_dict_literal, args, " Dict (string keys)") + _, results['dict_enum_uncompiled'], _ = benchmark(train_step_dict_enum_keys, args, " Dict (enum keys)") print("\nCompiled (reduce-overhead):") - _, results['tuple_compiled'], _ = benchmark(compiled_tuple, x, " Tuple") - _, results['namedtuple_compiled'], _ = benchmark(compiled_namedtuple, x, " NamedTuple") - _, results['dict_str_compiled'], _ = benchmark(compiled_dict_literal, x, " Dict (string keys)") - _, results['dict_enum_compiled'], _ = benchmark(compiled_dict_enum, x, " Dict (enum keys)") + _, results['tuple_compiled'], _ = benchmark(compiled_tuple, args, " Tuple") + _, results['namedtuple_compiled'], _ = benchmark(compiled_namedtuple, args, " NamedTuple") + _, results['dict_str_compiled'], _ = benchmark(compiled_dict_literal, args, " Dict (string keys)") + _, results['dict_enum_compiled'], _ = benchmark(compiled_dict_enum, args, " Dict (enum keys)") + + print("\n" + "=" * 70) + print("COMPILATION SPEEDUP (compiled vs uncompiled)") + print("=" * 70) + print(f" Tuple: {results['tuple_uncompiled']/results['tuple_compiled']:.2f}x faster") + print(f" NamedTuple: {results['namedtuple_uncompiled']/results['namedtuple_compiled']:.2f}x faster") + print(f" Dict (string keys): {results['dict_str_uncompiled']/results['dict_str_compiled']:.2f}x faster") + print(f" Dict (enum keys): {results['dict_enum_uncompiled']/results['dict_enum_compiled']:.2f}x faster") print("\n" + "=" * 70) print("SPEEDUP vs TUPLE (compiled)") From 2e9164ce16883edbeb727efb44463b96c77dbd84 Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Fri, 23 Jan 2026 07:31:46 -0800 Subject: [PATCH 5/8] refactor: use dict[LossType, Tensor] for train step returns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit replace tuple returns with dict[LossType, Tensor] for semantic access and programmatic tensorboard logging. changes: - train_step_nocompile: returns dict, builds incrementally with losses[LossType.X] = value - train_step_reconstruction_only_nocompile: returns dict with only computed losses (total, recon, reg) - loss accumulation: updated to work with dict instead of tuple indexing - tensorboard logging: now programmatic using loss_type.name.lower() iteration benefits: - semantic access: losses[LossType.RECON] instead of loss_tuple[1] - flexible returns: warmup only returns computed losses - programmatic logging: automatically logs all loss components - type safe: enum keys prevent typos benchmark showed dict with enum keys has <2% overhead vs tuple on gpu. 
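
for illustration, a minimal sketch of the consuming pattern (the
`toy_train_step` name and its losses are illustrative stand-ins, not the
model's real loss terms):

    from enum import Enum, auto

    import torch

    class LossType(Enum):
        TOTAL = auto()
        RECON = auto()
        REG = auto()

    def toy_train_step(x: torch.Tensor) -> dict[LossType, torch.Tensor]:
        losses: dict[LossType, torch.Tensor] = {}
        losses[LossType.RECON] = (x ** 2).mean()           # stand-in reconstruction loss
        losses[LossType.REG] = 1e-4 * torch.abs(x).mean()  # stand-in regularizer
        losses[LossType.TOTAL] = sum(losses.values())      # total over computed terms only
        return losses

    losses = toy_train_step(torch.randn(8, 4, requires_grad=True))
    losses[LossType.TOTAL].backward()                      # single backward on the total
    for loss_type, value in losses.items():                # programmatic logging
        print(f"Loss/train_{loss_type.name.lower()}: {value.item():.4e}")

only computed losses appear in the dict, so the logging loop adapts
automatically (e.g. the warmup step omits evolve/aug/tv).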
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/LatentEvolution/latent.py | 78 ++++++++++++++++------------------- 1 file changed, 36 insertions(+), 42 deletions(-) diff --git a/src/LatentEvolution/latent.py b/src/LatentEvolution/latent.py index 078d1984..1efc8f3c 100644 --- a/src/LatentEvolution/latent.py +++ b/src/LatentEvolution/latent.py @@ -308,7 +308,7 @@ def train_step_reconstruction_only_nocompile( _selected_neurons: torch.Tensor, _needed_indices: torch.Tensor, cfg: ModelParams - ): + ) -> dict[LossType, torch.Tensor]: """Train encoder/decoder only with reconstruction loss.""" device = train_data.device loss_fn = getattr(torch.nn.functional, cfg.training.loss_function) @@ -332,12 +332,15 @@ def train_step_reconstruction_only_nocompile( recon_t = model.decoder(proj_t) recon_loss = loss_fn(recon_t, x_t) - # return same tuple format for compatibility - evolve_loss = torch.tensor(0.0, device=device) - aug_loss = torch.tensor(0.0, device=device) - tv_loss = torch.tensor(0.0, device=device) - loss = recon_loss + reg_loss - return (loss, recon_loss, evolve_loss, reg_loss, aug_loss, tv_loss) + # total loss + total_loss = recon_loss + reg_loss + + # return only computed losses (no evolve, aug, tv during warmup) + return { + LossType.TOTAL: total_loss, + LossType.RECON: recon_loss, + LossType.REG: reg_loss, + } train_step_reconstruction_only = torch.compile( @@ -353,10 +356,13 @@ def train_step_nocompile( selected_neurons: torch.Tensor, needed_indices: torch.Tensor, cfg: ModelParams - ): + ) -> dict[LossType, torch.Tensor]: device=train_data.device + # initialize loss dict + losses: dict[LossType, torch.Tensor] = {} + # Get loss function from config loss_fn = getattr(torch.nn.functional, cfg.training.loss_function) @@ -371,6 +377,7 @@ def train_step_nocompile( if cfg.evolver_params.l1_reg_loss > 0.: for p in model.evolver.parameters(): reg_loss += torch.abs(p).mean()*cfg.evolver_params.l1_reg_loss + losses[LossType.REG] = reg_loss # total variation regularization on evolver updates tv_loss = torch.tensor(0.0, device=device) @@ -405,7 +412,7 @@ def train_step_nocompile( # reconstruction loss recon_t = model.decoder(proj_t) - recon_loss = loss_fn(recon_t, x_t) + losses[LossType.RECON] = loss_fn(recon_t, x_t) # Evolve by 1 time step. This is a special case since we may opt to apply # a connectome constraint via data augmentation. 
@@ -430,6 +437,7 @@ def train_step_nocompile( proj_t_aug = model.evolver(model.encoder(x_t_aug), proj_stim_t[0]) pred_t_plus_1_aug = model.decoder(proj_t_aug) aug_loss += cfg.training.unconnected_to_zero.loss_coeff * loss_fn(pred_t_plus_1_aug[:, selected_neurons], pred_t_plus_1[:, selected_neurons]) + losses[LossType.AUG_LOSS] = aug_loss # evolve for remaining dt-1 time steps (first window) evolve_loss = torch.tensor(0.0, device=device) @@ -477,8 +485,13 @@ def train_step_nocompile( x_target = train_data[target_indices_m, neuron_indices] # (b, N) evolve_loss = evolve_loss + loss_fn(pred, x_target) - loss = evolve_loss + recon_loss + reg_loss + aug_loss + tv_loss - return (loss, recon_loss, evolve_loss, reg_loss, aug_loss, tv_loss) + losses[LossType.EVOLVE] = evolve_loss + losses[LossType.TV_LOSS] = tv_loss + + # compute total loss + losses[LossType.TOTAL] = sum(losses.values()) + + return losses train_step = torch.compile(train_step_nocompile, fullgraph=True, mode="reduce-overhead") @@ -638,29 +651,21 @@ def train(cfg: ModelParams, run_dir: Path): needed_indices = torch.empty(0, dtype=torch.long, device=device) # use nocompile version for warmup - loss_tuple = train_step_reconstruction_only_nocompile( + losses = train_step_reconstruction_only_nocompile( model, chunk_data, chunk_stim, observation_indices, selected_neurons, needed_indices, cfg ) - loss_tuple[0].backward() + losses[LossType.TOTAL].backward() optimizer.step() - warmup_losses.accumulate({ - LossType.TOTAL: loss_tuple[0], - LossType.RECON: loss_tuple[1], - LossType.EVOLVE: loss_tuple[2], - LossType.REG: loss_tuple[3], - LossType.AUG_LOSS: loss_tuple[4], - LossType.TV_LOSS: loss_tuple[5], - }) + warmup_losses.accumulate(losses) warmup_epoch_duration = (datetime.now() - warmup_epoch_start).total_seconds() mean_warmup = warmup_losses.mean() print(f"warmup {warmup_epoch+1}/{recon_warmup_epochs} | recon loss: {mean_warmup[LossType.RECON]:.4e} | duration: {warmup_epoch_duration:.2f}s") - # log to tensorboard - writer.add_scalar("ReconWarmup/loss", mean_warmup[LossType.TOTAL], warmup_epoch) - writer.add_scalar("ReconWarmup/recon_loss", mean_warmup[LossType.RECON], warmup_epoch) - writer.add_scalar("ReconWarmup/reg_loss", mean_warmup[LossType.REG], warmup_epoch) + # log to tensorboard (only computed losses during warmup) + for loss_type, loss_value in mean_warmup.items(): + writer.add_scalar(f"ReconWarmup/{loss_type.name.lower()}", loss_value, warmup_epoch) writer.add_scalar("ReconWarmup/epoch_duration", warmup_epoch_duration, warmup_epoch) model.evolver.requires_grad_(True) @@ -771,13 +776,13 @@ def train(cfg: ModelParams, run_dir: Path): # training step (timing for latency tracking) forward_start = time.time() - loss_tuple = train_step_fn( + loss_dict = train_step_fn( model, chunk_data, chunk_stim, observation_indices, selected_neurons, needed_indices, cfg ) forward_time = time.time() - forward_start backward_start = time.time() - loss_tuple[0].backward() + loss_dict[LossType.TOTAL].backward() backward_time = time.time() - backward_start step_start = time.time() @@ -786,14 +791,7 @@ def train(cfg: ModelParams, run_dir: Path): optimizer.step() step_time = time.time() - step_start - losses.accumulate({ - LossType.TOTAL: loss_tuple[0], - LossType.RECON: loss_tuple[1], - LossType.EVOLVE: loss_tuple[2], - LossType.REG: loss_tuple[3], - LossType.AUG_LOSS: loss_tuple[4], - LossType.TV_LOSS: loss_tuple[5], - }) + losses.accumulate(loss_dict) # sample timing every 10 batches if batch_in_chunk % 10 == 0: @@ -813,13 +811,9 @@ def train(cfg: 
ModelParams, run_dir: Path): f"Duration: {epoch_duration:.2f}s (Total: {total_elapsed:.1f}s)" ) - # log to tensorboard - writer.add_scalar("Loss/train", mean_losses[LossType.TOTAL], epoch) - writer.add_scalar("Loss/train_recon", mean_losses[LossType.RECON], epoch) - writer.add_scalar("Loss/train_evolve", mean_losses[LossType.EVOLVE], epoch) - writer.add_scalar("Loss/train_reg", mean_losses[LossType.REG], epoch) - writer.add_scalar("Loss/train_aug_loss", mean_losses[LossType.AUG_LOSS], epoch) - writer.add_scalar("Loss/train_tv_loss", mean_losses[LossType.TV_LOSS], epoch) + # log to tensorboard (programmatically iterate over all losses) + for loss_type, loss_value in mean_losses.items(): + writer.add_scalar(f"Loss/train_{loss_type.name.lower()}", loss_value, epoch) writer.add_scalar("Time/epoch_duration", epoch_duration, epoch) writer.add_scalar("Time/total_elapsed", total_elapsed, epoch) From c4438982622bfb757988a5c226a16c446e8063d1 Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Fri, 23 Jan 2026 07:31:59 -0800 Subject: [PATCH 6/8] chore: remove benchmark test script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test completed, results confirmed dict/namedtuple have <2% overhead. keeping results in git history for reference. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- test_compile_returns.py | 210 ---------------------------------------- 1 file changed, 210 deletions(-) delete mode 100644 test_compile_returns.py diff --git a/test_compile_returns.py b/test_compile_returns.py deleted file mode 100644 index a3f961ab..00000000 --- a/test_compile_returns.py +++ /dev/null @@ -1,210 +0,0 @@ -"""Test torch.compile compatibility with different return types.""" -import torch -import time -import numpy as np -from enum import Enum, auto -from typing import NamedTuple, Dict - - -class LossType(Enum): - """Loss component types.""" - TOTAL = auto() - RECON = auto() - EVOLVE = auto() - - -# NamedTuple version -class LossDict(NamedTuple): - total: torch.Tensor - recon: torch.Tensor - evolve: torch.Tensor - - -# Test functions - simulate realistic training step -def train_step_tuple(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> tuple: - """Return regular tuple.""" - # Simulate encoder/decoder operations - h1 = torch.relu(x @ w1) # hidden layer - out = h1 @ w2 # output - - # Multiple loss computations - loss1 = ((out - x) ** 2).mean() # reconstruction - loss2 = torch.abs(h1).mean() # l1 reg - loss3 = ((out[1:] - out[:-1]) ** 2).mean() # temporal smoothness - total = loss1 + 0.1 * loss2 + 0.01 * loss3 - return (total, loss1, loss2, loss3) - - -def train_step_namedtuple(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> LossDict: - """Return NamedTuple.""" - h1 = torch.relu(x @ w1) - out = h1 @ w2 - - loss1 = ((out - x) ** 2).mean() - loss2 = torch.abs(h1).mean() - loss3 = ((out[1:] - out[:-1]) ** 2).mean() - total = loss1 + 0.1 * loss2 + 0.01 * loss3 - return LossDict(total=total, recon=loss1, evolve=loss3) - - -def train_step_dict_literal(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> dict: - """Return dict with literal {} syntax.""" - h1 = torch.relu(x @ w1) - out = h1 @ w2 - - loss1 = ((out - x) ** 2).mean() - loss2 = torch.abs(h1).mean() - loss3 = ((out[1:] - out[:-1]) ** 2).mean() - total = loss1 + 0.1 * loss2 + 0.01 * loss3 - return {"total": total, "recon": loss1, "evolve": loss3} - - -def train_step_dict_enum_keys(x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor) -> Dict[LossType, 
torch.Tensor]: - """Return dict with enum keys.""" - h1 = torch.relu(x @ w1) - out = h1 @ w2 - - loss1 = ((out - x) ** 2).mean() - loss2 = torch.abs(h1).mean() - loss3 = ((out[1:] - out[:-1]) ** 2).mean() - total = loss1 + 0.1 * loss2 + 0.01 * loss3 - return {LossType.TOTAL: total, LossType.RECON: loss1, LossType.EVOLVE: loss3} - - -def benchmark(fn, args, name, num_iters=1000, num_trials=10): - """Benchmark a function with error bars.""" - # Warmup - for _ in range(10): - result = fn(*args) - - # Multiple trials - trial_times = [] - for _ in range(num_trials): - if args[0].is_cuda: - torch.cuda.synchronize() - start = time.time() - for _ in range(num_iters): - result = fn(*args) - if args[0].is_cuda: - torch.cuda.synchronize() - elapsed = time.time() - start - trial_times.append(elapsed) - - # Statistics - mean_time = np.mean(trial_times) - std_time = np.std(trial_times) - mean_us = mean_time * 1000000 / num_iters - std_us = std_time * 1000000 / num_iters - - print(f"{name:30s}: {mean_us:.2f} ± {std_us:.2f} µs/iter") - return result, mean_us, std_us - - -if __name__ == "__main__": - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - print(f"Testing on device: {device}\n") - - # Create test data - simulate batch of neural activity - batch_size = 256 - num_neurons = 1000 - latent_dim = 256 - - x = torch.randn(batch_size, num_neurons, device=device) - w1 = torch.randn(num_neurons, latent_dim, device=device) / (num_neurons ** 0.5) - w2 = torch.randn(latent_dim, num_neurons, device=device) / (latent_dim ** 0.5) - args = (x, w1, w2) - - print("=" * 70) - print("COMPILATION TEST (checking if each compiles without error)") - print("=" * 70) - - # Test 1: Regular tuple - try: - compiled_tuple = torch.compile(train_step_tuple, fullgraph=True, mode="reduce-overhead") - result = compiled_tuple(*args) - print(f"✓ Regular tuple: SUCCESS - returns {type(result)}") - print(f" Values: {[f'{v.item():.4f}' for v in result]}") - except Exception as e: - print(f"✗ Regular tuple: FAILED - {e}") - - # Test 2: NamedTuple - try: - compiled_namedtuple = torch.compile(train_step_namedtuple, fullgraph=True, mode="reduce-overhead") - result = compiled_namedtuple(*args) - print(f"✓ NamedTuple: SUCCESS - returns {type(result)}") - print(f" Values: total={result.total.item():.4f}, recon={result.recon.item():.4f}, evolve={result.evolve.item():.4f}") - print(f" Can access by name: result.total = {result.total.item():.4f}") - except Exception as e: - print(f"✗ NamedTuple: FAILED - {e}") - - # Test 3: Dict with string keys (literal {}) - try: - compiled_dict_literal = torch.compile(train_step_dict_literal, fullgraph=True, mode="reduce-overhead") - result = compiled_dict_literal(*args) - print(f"✓ Dict (string keys): SUCCESS - returns {type(result)}") - print(f" Values: {[(k, f'{v.item():.4f}') for k, v in result.items()]}") - except Exception as e: - print(f"✗ Dict (string keys): FAILED - {e}") - - # Test 4: Dict with enum keys - try: - compiled_dict_enum = torch.compile(train_step_dict_enum_keys, fullgraph=True, mode="reduce-overhead") - result = compiled_dict_enum(*args) - print(f"✓ Dict (enum keys): SUCCESS - returns {type(result)}") - print(f" Values: {[(k, f'{v.item():.4f}') for k, v in result.items()]}") - except Exception as e: - print(f"✗ Dict (enum keys): FAILED - {e}") - - print("\n" + "=" * 70) - print("PERFORMANCE BENCHMARK (mean ± std over 10 trials)") - print("=" * 70) - - # Benchmark each version - results = {} - - print("\nUncompiled:") - _, results['tuple_uncompiled'], _ = 
benchmark(train_step_tuple, args, " Tuple") - _, results['namedtuple_uncompiled'], _ = benchmark(train_step_namedtuple, args, " NamedTuple") - _, results['dict_str_uncompiled'], _ = benchmark(train_step_dict_literal, args, " Dict (string keys)") - _, results['dict_enum_uncompiled'], _ = benchmark(train_step_dict_enum_keys, args, " Dict (enum keys)") - - print("\nCompiled (reduce-overhead):") - _, results['tuple_compiled'], _ = benchmark(compiled_tuple, args, " Tuple") - _, results['namedtuple_compiled'], _ = benchmark(compiled_namedtuple, args, " NamedTuple") - _, results['dict_str_compiled'], _ = benchmark(compiled_dict_literal, args, " Dict (string keys)") - _, results['dict_enum_compiled'], _ = benchmark(compiled_dict_enum, args, " Dict (enum keys)") - - print("\n" + "=" * 70) - print("COMPILATION SPEEDUP (compiled vs uncompiled)") - print("=" * 70) - print(f" Tuple: {results['tuple_uncompiled']/results['tuple_compiled']:.2f}x faster") - print(f" NamedTuple: {results['namedtuple_uncompiled']/results['namedtuple_compiled']:.2f}x faster") - print(f" Dict (string keys): {results['dict_str_uncompiled']/results['dict_str_compiled']:.2f}x faster") - print(f" Dict (enum keys): {results['dict_enum_uncompiled']/results['dict_enum_compiled']:.2f}x faster") - - print("\n" + "=" * 70) - print("SPEEDUP vs TUPLE (compiled)") - print("=" * 70) - baseline = results['tuple_compiled'] - print(" Tuple: 1.00x (baseline)") - print(f" NamedTuple: {baseline/results['namedtuple_compiled']:.2f}x") - print(f" Dict (string keys): {baseline/results['dict_str_compiled']:.2f}x") - print(f" Dict (enum keys): {baseline/results['dict_enum_compiled']:.2f}x") - - print("\n" + "=" * 70) - print("OVERHEAD vs TUPLE (compiled)") - print("=" * 70) - print(f" NamedTuple: {(results['namedtuple_compiled']/baseline - 1)*100:+.1f}%") - print(f" Dict (string keys): {(results['dict_str_compiled']/baseline - 1)*100:+.1f}%") - print(f" Dict (enum keys): {(results['dict_enum_compiled']/baseline - 1)*100:+.1f}%") - - print("\n" + "=" * 70) - print("RECOMMENDATION") - print("=" * 70) - overhead_namedtuple = (results['namedtuple_compiled']/baseline - 1)*100 - if overhead_namedtuple < 20: - print(f"✓ NamedTuple has only {overhead_namedtuple:.1f}% overhead → RECOMMENDED") - print(" Benefits: semantic access (result.recon), type safety, immutable") - else: - print(f"✗ NamedTuple has {overhead_namedtuple:.1f}% overhead → Use regular tuple") - print("\nFor this codebase: NamedTuple with field names matching LossType enum") From 10d923aff7085d76551f332908665b5626e27382 Mon Sep 17 00:00:00 2001 From: Vijay Kumar Date: Fri, 23 Jan 2026 07:33:13 -0800 Subject: [PATCH 7/8] docs: add benchmarking results to LossType docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit document that dict[LossType, Tensor] has <2% overhead vs tuple based on gpu benchmarking with realistic computation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/LatentEvolution/latent.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/LatentEvolution/latent.py b/src/LatentEvolution/latent.py index 1efc8f3c..ad9f9217 100644 --- a/src/LatentEvolution/latent.py +++ b/src/LatentEvolution/latent.py @@ -172,7 +172,16 @@ def load_val_only( class LossType(Enum): - """loss component types for EED model.""" + """loss component types for EED model. + + train steps return dict[LossType, Tensor] for semantic access. 
+
+    benchmarking results (gpu, 256 batch, realistic encoder/decoder ops):
+    - dict with enum keys: +0.5% to +1.4% overhead vs tuple
+    - namedtuple: +0.8% to +2.0% overhead vs tuple
+    - compilation speedup: 1.87-1.93x faster (all return types)
+    - conclusion: negligible overhead, semantic access worth it
+    """
     TOTAL = auto()
     RECON = auto()
     EVOLVE = auto()

From 9a7c8d81e8f69e90009710858ca8065fdb54363f Mon Sep 17 00:00:00 2001
From: Vijay Kumar
Date: Fri, 23 Jan 2026 07:50:24 -0800
Subject: [PATCH 8/8] docs: add stability ablation and tv norm sweep experiments

---
 .../experiments/flyvis_voltage_100ms.md | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/src/LatentEvolution/experiments/flyvis_voltage_100ms.md b/src/LatentEvolution/experiments/flyvis_voltage_100ms.md
index d7ee70f6..55f15d38 100644
--- a/src/LatentEvolution/experiments/flyvis_voltage_100ms.md
+++ b/src/LatentEvolution/experiments/flyvis_voltage_100ms.md
@@ -312,3 +312,45 @@ for ems in 1 2 3 5; do \
   python src/LatentEvolution/latent.py ems_sweep latent_20step.yaml \
   --training.evolve-multiple-steps $ems
 ```
+
+## Derivative experiment
+
+Prior to changing the evolver (tanh activation, initialized to 0) and using
+pretraining for reconstruction, we observed an instability in training in the
+`tu20_seed_sweep_20260115_f6144bd` experiment. We want to revisit and see if
+adding TV norm regularization can rescue that phenotype.
+
+First, understand which feature contributes to the stability: we changed many
+things at the same time to reach a working configuration.
+
+```bash
+for zero_init in zero-init no-zero-init; do \
+  for activation in Tanh ReLU; do \
+    for warmup in 0 10; do \
+      name="z${zero_init}_${activation}_w${warmup}"
+      bsub -J $name -q gpu_a100 -gpu "num=1" -n 8 -o ${name}.log \
+        python src/LatentEvolution/latent.py test_stability latent_20step.yaml \
+        --evolver_params.${zero_init} \
+        --evolver_params.activation $activation \
+        --training.reconstruction-warmup-epochs $warmup \
+        --training.seed 35235
+    done
+  done
+done
+```
+
+Second, test whether TV norm alone can bring stability to training, without
+pretraining for reconstruction or the other features we added.
+
+```bash
+
+for tv in 0.0 0.00001 0.0001 0.001; do \
+  bsub -J tv${tv} -q gpu_a100 -gpu "num=1" -n 8 -o tv${tv}.log \
+    python src/LatentEvolution/latent.py tv_sweep latent_20step.yaml \
+    --evolver_params.no-zero-init \
+    --evolver_params.activation ReLU \
+    --training.reconstruction-warmup-epochs 0 \
+    --evolver_params.tv-reg-loss $tv \
+    --training.seed 97651
+done
+```
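
For reference, the quantity being swept here: `tv_reg_loss` scales an L1
penalty on the magnitude of each evolver update, accumulated over every
evolved step (see PATCH 2). A minimal sketch of the accumulation, simplified
from `train_step_nocompile` (the `Linear` evolver and the dimensions are
stand-ins, not the real model):

```python
import torch

# stand-in for model.evolver.evolver: maps concat([z_t, stim_t]) -> delta_z
evolver = torch.nn.Linear(16 + 4, 16)

tv_reg_loss = 1e-4                  # cfg.evolver_params.tv_reg_loss
z = torch.zeros(32, 16)             # latent state (batch, latent_dims)
stim = torch.randn(5, 32, 4)        # projected stimulus for 5 time steps

tv_loss = torch.tensor(0.0)
for t in range(stim.shape[0]):
    delta_z = evolver(torch.cat([z, stim[t]], dim=1))
    tv_loss = tv_loss + tv_reg_loss * torch.abs(delta_z).mean()  # ||delta_z||_1 penalty
    z = z + delta_z                 # residual update: z_{t+1} = z_t + delta_z
```

With `tv_reg_loss` at 0.0 the original evolver path is taken and no `delta_z`
is materialized, so the zero point of the sweep reproduces pre-patch behavior.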