From ec93aa419bfcd45b9fc064945a3aad8252cf5a2d Mon Sep 17 00:00:00 2001
From: ElmoPA <elmoworld2005@gmail.com>
Date: Sat, 28 Feb 2026 00:44:36 -0500
Subject: [PATCH 1/2] Latent flow matching that works regardless of action
 dimension

---
 .../hpt_cotrain_flow_shared_head_latent.yaml  |  75 +++++
 egomimic/models/conv/temporal_enc_dec.py      | 305 ++++++++++++++++++
 egomimic/models/denoising_policy.py           |  18 +-
 egomimic/models/fm_policy.py                  |   2 +
 4 files changed, 391 insertions(+), 9 deletions(-)
 create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml
 create mode 100644 egomimic/models/conv/temporal_enc_dec.py

diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml
new file mode 100644
index 00000000..2d45f799
--- /dev/null
+++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml
@@ -0,0 +1,75 @@
+defaults:
+  - hpt_cotrain_enc_dec_base
+
+robomimic_model:
+  ac_keys:
+    eva_bimanual: "actions_cartesian"
+    aria_bimanual: "actions_cartesian"
+  shared_ac_key: "actions_cartesian"
+
+  6dof: true
+  diffusion: true
+
+  head_specs:
+    aria_bimanual: null
+    eva_bimanual: null
+    shared:
+      _target_: egomimic.models.fm_policy.FMPolicy
+      action_horizon: 100
+      num_inference_steps: 50
+      pooling: null
+      padding: "zero"
+      time_dist: "beta"
+      infer_ac_dims:
+        eva_bimanual: 14
+        aria_bimanual: 14
+      model:
+        _target_: egomimic.models.denoising_nets.CrossTransformer
+        nblocks: 6
+        cond_dim: 256
+        hidden_dim: 128
+        act_dim: 14
+        act_seq: 100
+        n_heads: 4
+        dropout: 0.1
+        mlp_layers: 4
+        mlp_ratio: 4
+      latent_map:
+        eva_bimanual:
+          encoder:
+            _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalEncoder
+            action_dim: 14
+            hidden_dim: 128
+            activation: "gelu"
+            use_layernorm: false
+          decoder:
+            _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalDecoder
+            action_dim: 14
+            hidden_dim: 128
+            activation: "gelu"
+            use_layernorm: true
+        aria_keypoints:
+          encoder:
+            _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalEncoder
+            action_dim: 140
+            hidden_dim: 128
+            activation: "gelu"
+            use_layernorm: false
+          decoder:
+            _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalDecoder
+            action_dim: 140
+            hidden_dim: 128
+            activation: "gelu"
+            use_layernorm: true
+
+optimizer:
+  _target_: torch.optim.AdamW
+  _partial_: true
+  lr: 1e-4
+  weight_decay: 0.0001
+
+scheduler:
+  _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+  _partial_: true
+  T_max: 1400
+  eta_min: 1e-5
diff --git a/egomimic/models/conv/temporal_enc_dec.py b/egomimic/models/conv/temporal_enc_dec.py
new file mode 100644
index 00000000..4d438c82
--- /dev/null
+++ b/egomimic/models/conv/temporal_enc_dec.py
@@ -0,0 +1,305 @@
+from __future__ import annotations
+
+from typing import List
+
+import torch
+import torch.nn as nn
+
+
+class SmallTemporalEncoder(nn.Module):
+    """
+    Fix temporal encoder for 100 seq of actiona
+    """
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+    ):
+        super().__init__()
+        if activation == "relu":
+            act = nn.ReLU()
+        elif activation == "gelu":
+            act = nn.GELU()
+        elif activation == "silu":
+            act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+        
+        layers = [nn.Conv1d(action_dim, action_dim*2, kernel_size=8, stride=2, padding=3),
+                  act,
+                  nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=2),
+                  act,
+                  nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=3),
+                  act,
+                ]        
+
+
+        hidden_dim = 64
+        self.down = nn.Sequential(*layers)
+        self.proj = nn.Linear(action_dim*2, hidden_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Input:  (B, T, D) or (T, D)
+        Output: (B, K, H) or (K, H)
+        """
+        squeeze_B = False
+        if x.dim() == 2:
+            x = x.unsqueeze(0)
+            squeeze_B = True
+        elif x.dim() != 3:
+            raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}")
+
+        x = x.transpose(1, 2)          # (B, D, T)
+        x = self.down(x)               # (B, D, K)
+        x = x.transpose(1, 2)          # (B, K, D)
+        x = self.proj(x)    # (B, K, H)
+
+        return x.squeeze(0) if squeeze_B else x
+
+class SmallTemporalDecoder(nn.Module):
+    """
+    Decoder that mirrors SmallTemporalEncoder:
+        Enc convs (over time, channels-first):
+            (D -> 2D) k=8 s=2 p=3
+            (2D -> 2D) k=8 s=2 p=2
+            (2D -> 2D) k=8 s=2 p=3
+        For T=100 this encoder produces K=12.
+
+    This decoder maps:
+        Input:  (B, K=12, H=64) or (K, H)
+        Output: (B, T=100, D)   or (T, D)
+    """
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 64,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        K: int = 12,
+        T: int = 100,
+    ):
+        super().__init__()
+        self.action_dim = action_dim
+        self.hidden_dim = hidden_dim
+        self.K = K
+        self.T = T
+
+        if activation == "relu":
+            act = nn.ReLU()
+        elif activation == "gelu":
+            act = nn.GELU()
+        elif activation == "silu":
+            act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+
+        C2 = action_dim * 2
+
+        self.proj = nn.Linear(hidden_dim, C2)
+        self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity()
+
+        self.up = nn.Sequential(
+            nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0),
+            act,
+            nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0),
+            act,
+            nn.ConvTranspose1d(C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0),
+        )
+
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        squeeze_B = False
+        if z.dim() == 2:
+            z = z.unsqueeze(0)
+            squeeze_B = True
+        elif z.dim() != 3:
+            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
+
+        B, K, H = z.shape
+        if H != self.hidden_dim:
+            raise ValueError(f"Expected H={self.hidden_dim}, got {H}")
+        if K != self.K:
+            raise ValueError(f"Expected K={self.K}, got {K}")
+
+        x = self.norm(self.proj(z))     # (B, K, 2D)
+        x = x.transpose(1, 2)           # (B, 2D, K)
+        x = self.up(x)                  # (B, D, T)
+        if x.shape[-1] != self.T:
+            raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}")
+        x = x.transpose(1, 2)           # (B, T, D)
+
+        return x.squeeze(0) if squeeze_B else x
+
+class LargeTemporalEncoder(nn.Module):
+    """
+    Encoder for (B, T=100, D) that halves channels: D -> D/2,
+    and downsamples time: 100 -> 12.
+    Output: (B, K=12, H)
+    """
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 64,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        expect_T: int | None = 100,
+    ):
+        super().__init__()
+        if action_dim % 2 != 0:
+            raise ValueError(f"action_dim must be even to halve. Got {action_dim}")
+
+        self.action_dim = action_dim
+        self.hidden_dim = hidden_dim
+        self.expect_T = expect_T
+
+        if activation == "relu":
+            act = nn.ReLU()
+        elif activation == "gelu":
+            act = nn.GELU()
+        elif activation == "silu":
+            act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+
+        D = action_dim
+
+        self.down = nn.Sequential(
+            nn.Conv1d(D,  action_dim, kernel_size=8, stride=2, padding=3),  # 100 -> 50
+            act,
+            nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2),  # 50 -> 24
+            act,
+            nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3),  # 24 -> 12
+            act,
+        )
+
+        self.proj = nn.Linear(action_dim, hidden_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        squeeze_B = False
+        if x.dim() == 2:
+            x = x.unsqueeze(0)
+            squeeze_B = True
+        elif x.dim() != 3:
+            raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}")
+
+        B, T, D = x.shape
+        if D != self.action_dim:
+            raise ValueError(f"Expected D={self.action_dim}, got {D}")
+        if self.expect_T is not None and T != self.expect_T:
+            raise ValueError(f"Expected T={self.expect_T}, got {T}")
+
+        x = x.transpose(1, 2)           # (B, D, T)
+        x = self.down(x)                # (B, D/2, K=12)
+        x = x.transpose(1, 2)           # (B, K, D/2)
+        x = self.proj(x)     # (B, K, H)
+        return x.squeeze(0) if squeeze_B else x
+
+
+class LargeTemporalDecoder(nn.Module):
+    """
+    Decoder that mirrors LargeTemporalEncoder:
+        time: 12 -> 24 -> 50 -> 100
+        channels: H -> D/2 -> D
+    Input:  (B, K=12, H) or (K, H)
+    Output: (B, T=100, D) or (T, D)
+    """
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 64,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        K: int = 12,
+        T: int = 100,
+    ):
+        super().__init__()
+        if action_dim % 2 != 0:
+            raise ValueError(f"action_dim must be even to halve. Got {action_dim}")
+
+        self.action_dim = action_dim
+        self.half_dim = action_dim // 2
+        self.hidden_dim = hidden_dim
+        self.K = K
+        self.T = T
+
+        if activation == "relu":
+            act = nn.ReLU()
+        elif activation == "gelu":
+            act = nn.GELU()
+        elif activation == "silu":
+            act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+
+        self.proj = nn.Linear(hidden_dim, action_dim)
+        self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity()
+
+        # Mirrors paddings/strides/kernels in reverse.
+        # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params.
+        self.up = nn.Sequential(
+            nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0),
+            act,
+            nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2, output_padding=0),
+            act,
+            nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0),
+        )
+
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        squeeze_B = False
+        if z.dim() == 2:
+            z = z.unsqueeze(0)
+            squeeze_B = True
+        elif z.dim() != 3:
+            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
+
+        B, K, H = z.shape
+        if H != self.hidden_dim:
+            raise ValueError(f"Expected H={self.hidden_dim}, got {H}")
+        if K != self.K:
+            raise ValueError(f"Expected K={self.K}, got {K}")
+
+        x = self.norm(self.proj(z))     # (B, K, D/2)
+        x = x.transpose(1, 2)           # (B, D/2, K)
+        x = self.up(x)                  # (B, D, T)
+        if x.shape[-1] != self.T:
+            raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}")
+        x = x.transpose(1, 2)           # (B, T, D)
+        return x.squeeze(0) if squeeze_B else x
+
+
+def count_params(module: nn.Module, trainable_only: bool = False) -> int:
+    if trainable_only:
+        return sum(p.numel() for p in module.parameters() if p.requires_grad)
+    return sum(p.numel() for p in module.parameters())
+
+
+def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None:
+    total = 0
+    for name, p in module.named_parameters():
+        if trainable_only and not p.requires_grad:
+            continue
+        n = p.numel()
+        total += n
+        print(f"{name:60s} {tuple(p.shape)!s:20s} {n}")
+    print(f"\nTOTAL params: {total}")
+
+if __name__ == "__main__":
+    B, T, D = 8, 100, 140 
+
+    enc = LargeTemporalEncoder(action_dim=D)
+    dec = LargeTemporalDecoder(action_dim=D, use_layernorm=True)
+
+    x = torch.randn(B, T, D)
+    z = enc(x)
+    x_hat = dec(z)
+    
+    print(count_params(enc))
+    print(count_params(enc, trainable_only=True))
+    print_param_breakdown(enc)
+    
+    
\ No newline at end of file
diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py
index 645a8c44..25ccc641 100644
--- a/egomimic/models/denoising_policy.py
+++ b/egomimic/models/denoising_policy.py
@@ -68,7 +68,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None):
         )
         return noise, global_cond
 
-    def inference(self, noise, global_cond, generator=None) -> torch.Tensor:
+    def inference(self, noise, global_cond, generator=None) -> torch.Tensor:  # pyright: ignore[reportUnusedParameter]
         """
         To be implemented in subclass: predict actions from noise and conditioning.
         """
@@ -78,13 +78,13 @@ def sample_action(self, global_cond, embodiment_name, generator=None):
         noise, global_cond = self.preprocess_sampling(
             global_cond, embodiment_name, generator
         )
-        return self.inference(noise, global_cond, generator)
+        return self.inference(noise, global_cond, generator, embodiment_name)
 
-    def forward(self, global_cond):
+    def forward(self, global_cond, embodiment_name):
         cond, embodiment = global_cond
-        return self.sample_action(cond, embodiment)
+        return self.sample_action(cond, embodiment, embodiment_name)
 
-    def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]:
+    def predict(self, actions, global_cond, embodiment_name) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         To be implemented in subclass: returns (prediction, target) given action input and conditioning.
         """
@@ -96,7 +96,7 @@ def loss_fn(self, pred, target):
         """
         return F.mse_loss(pred, target)
 
-    def preprocess_compute_loss(self, global_cond, data):
+    def preprocess_compute_loss(self, global_cond, data, embodiment_name):
         if self.pooling == "mean":
             global_cond = global_cond.mean(dim=1)
         elif self.pooling == "flatten":
@@ -121,7 +121,7 @@ def preprocess_compute_loss(self, global_cond, data):
 
         return actions, global_cond
 
-    def compute_loss(self, global_cond, data):
-        actions, global_cond = self.preprocess_compute_loss(global_cond, data)
-        pred, target = self.predict(actions, global_cond)
+    def compute_loss(self, global_cond, data, embodiment_name):
+        actions, global_cond = self.preprocess_compute_loss(global_cond, data, embodiment_name)
+        pred, target = self.predict(actions, global_cond, embodiment_name)
         return self.loss_fn(pred, target)
diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py
index e41f4943..551853a6 100644
--- a/egomimic/models/fm_policy.py
+++ b/egomimic/models/fm_policy.py
@@ -26,12 +26,14 @@ def __init__(
         action_horizon,
         infer_ac_dims,
         num_inference_steps=None,
+        encoder_map=None,
         **kwargs,
     ):
         super().__init__(
             model, action_horizon, infer_ac_dims, num_inference_steps, **kwargs
         )
         self.time_dist = kwargs.get("time_dist", "beta")
+        self.encoder_map = encoder_map
 
     def step(self, x_t, t, global_cond):
         if len(t.shape) != 1:

From 65cf274898202da2626eff38c8181cc22b7efa23 Mon Sep 17 00:00:00 2001
From: ElmoPA <elmoworld2005@gmail.com>
Date: Thu, 5 Mar 2026 16:14:20 -0500
Subject: [PATCH 2/2] Changes for latent flow

---
 egomimic/algo/hpt.py                          |   9 +-
 .../data/eva_human_keypoints_cotrain.yaml     |  73 +++
 .../hydra/launcher/submitit.yaml              |  20 +-
 .../hpt_cotrain_flow_shared_head_latent.yaml  |  33 +-
 ...cotrain_flow_shared_head_latent_large.yaml |  74 +++
 ...t_cotrain_flow_shared_head_latent_mlp.yaml |  70 +++
 .../model/hpt_cotrain_keypoints_base.yaml     | 147 ++++++
 egomimic/hydra_configs/train_zarr_latent.yaml | 111 ++++
 egomimic/hydra_configs/trainer/ddp.yaml       |   4 +-
 egomimic/hydra_configs/trainer/debug.yaml     |   2 +-
 .../eva_cartesian_aria_keypoints.yaml         |  20 +-
 egomimic/models/codec/mlp.py                  |  16 +
 egomimic/models/codec/temporal_enc_dec.py     | 477 ++++++++++++++++++
 egomimic/models/conv/temporal_enc_dec.py      | 305 -----------
 egomimic/models/denoising_policy.py           |  68 ++-
 egomimic/models/fm_policy.py                  |  60 ++-
 egomimic/rldb/embodiment/human.py             |  51 +-
 egomimic/trainHydra.py                        |   6 +-
 egomimic/train_zarr.yaml                      | 111 ++++
 egomimic/utils/viz_utils.py                   |  12 +-
 20 files changed, 1287 insertions(+), 382 deletions(-)
 create mode 100644 egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml
 create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml
 create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml
 create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml
 create mode 100644 egomimic/hydra_configs/train_zarr_latent.yaml
 create mode 100644 egomimic/models/codec/mlp.py
 create mode 100644 egomimic/models/codec/temporal_enc_dec.py
 delete mode 100644 egomimic/models/conv/temporal_enc_dec.py
 create mode 100644 egomimic/train_zarr.yaml

diff --git a/egomimic/algo/hpt.py b/egomimic/algo/hpt.py
index 6ae47832..1deca207 100644
--- a/egomimic/algo/hpt.py
+++ b/egomimic/algo/hpt.py
@@ -829,7 +829,6 @@ def __init__(
         self.domains = domains.copy()
         self.auxiliary_ac_keys = auxiliary_ac_keys.copy()
         self.shared_ac_key = kwargs.get("shared_ac_key", None)
-        self.is_6dof = kwargs.get("6dof", False)
         self.kinematics_solver = kwargs.get("kinematics_solver", None)
 
         model = HPTModel(**trunk)
@@ -1282,13 +1281,16 @@ def compute_losses(self, predictions, batch):
             embodiment_name = get_embodiment(embodiment_id).lower()
             bc_loss = predictions[f"{embodiment_name}_loss"]
             scaled_bc_loss = bc_weight * bc_loss
-            total_action_loss += scaled_bc_loss
+            total_action_loss = total_action_loss + scaled_bc_loss
             loss_dict[f"{embodiment_name}_loss"] = bc_loss  # for logging
 
         if self.ot:
             loss_dict["ot_loss"] = predictions["ot_loss"]
             loss_dict["avg_feature_distance"] = predictions["avg_feature_distance"]
-            total_action_loss += ot_weight * self.temperature * predictions["ot_loss"]
+            total_action_loss = (
+                total_action_loss
+                + ot_weight * self.temperature * predictions["ot_loss"]
+            )
 
         loss_dict["action_loss"] = total_action_loss / len(self.domains)
         return loss_dict
@@ -1372,7 +1374,6 @@ def _robomimic_to_hpt_data(
             if key in batch:
                 data[key] = batch[key]
 
-        data["is_6dof"] = self.is_6dof
         data["pad_mask"] = batch["pad_mask"]
         data["embodiment"] = batch["embodiment"]
 
diff --git a/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml
new file mode 100644
index 00000000..111ef609
--- /dev/null
+++ b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml
@@ -0,0 +1,73 @@
+_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper
+train_datasets:
+  eva_bimanual:
+    _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver
+    resolver:
+      _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver
+      folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/
+      key_map:
+        _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap
+      transform_list:
+        _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list
+    filters:
+      robot_name: "eva_bimanual"
+      task: "fold_clothes"
+    mode: total
+  aria_bimanual:
+    _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver
+    resolver:
+      _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver
+      folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria
+      key_map:
+        _target_: egomimic.rldb.embodiment.human.Aria.get_keymap
+        mode: keypoints
+      transform_list:
+        _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list
+        mode: keypoints
+    filters:
+      robot_name: "aria_bimanual"
+      task: "fold_clothes_indomain"
+    mode: total
+valid_datasets:
+  eva_bimanual:
+    _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver
+    resolver:
+      _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver
+      folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/
+      key_map:
+        _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap
+      transform_list:
+        _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list
+    filters:
+      robot_name: "eva_bimanual"
+      task: "fold_clothes"
+    mode: total
+  aria_bimanual:
+    _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver
+    resolver:
+      _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver
+      folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria
+      key_map:
+        _target_: egomimic.rldb.embodiment.human.Aria.get_keymap
+        mode: keypoints
+      transform_list:
+        _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list
+        mode: keypoints
+    filters:
+      robot_name: "aria_bimanual"
+      task: "fold_clothes_indomain"
+    mode: total
+train_dataloader_params:
+  eva_bimanual:
+    batch_size: 64
+    num_workers: 10
+  aria_bimanual:
+    batch_size: 64
+    num_workers: 10
+valid_dataloader_params:
+  eva_bimanual:
+    batch_size: 64
+    num_workers: 10
+  aria_bimanual:
+    batch_size: 64
+    num_workers: 10
diff --git a/egomimic/hydra_configs/hydra/launcher/submitit.yaml b/egomimic/hydra_configs/hydra/launcher/submitit.yaml
index c56f2cd5..b068685e 100644
--- a/egomimic/hydra_configs/hydra/launcher/submitit.yaml
+++ b/egomimic/hydra_configs/hydra/launcher/submitit.yaml
@@ -4,15 +4,15 @@ defaults:
 _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher
 
 # Slurm configuration
-name: ${hydra.job.name}                             # Default job name
-partition: "rl2-lab"                            # Slurm partition (e.g., 'gpu' or 'compute')
-account: "rl2-lab"                              # Slurm account (e.g., 'my_account')
-cpus_per_task: 12                                   # Number of CPUs per task
-nodes: ${launch_params.nodes}                       # Number of nodes
-tasks_per_node: ${launch_params.gpus_per_node}      # Use variable for tasks per node
+name: ${hydra.job.name} # Default job name
+partition: "hoffman-lab" # Slurm partition (e.g., 'gpu' or 'compute')
+account: "hoffman-lab" # Slurm account (e.g., 'my_account')
+cpus_per_task: 12 # Number of CPUs per task
+nodes: ${launch_params.nodes} # Number of nodes
+tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node
 gres: "gpu:a40:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count
-qos: "short"                                        # Slurm QoS
-timeout_min: 2880                                   # Timeout in minutes (48 hours)
-exclude: "protocol, puma"     # Nodes to exclude
+qos: "short" # Slurm QoS
+timeout_min: 2880 # Timeout in minutes (48 hours)
+exclude: "protocol, puma" # Nodes to exclude
 additional_parameters:
-  requeue: true
\ No newline at end of file
+  requeue: true
diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml
index 2d45f799..b256d18d 100644
--- a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml
+++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml
@@ -1,11 +1,11 @@
 defaults:
-  - hpt_cotrain_enc_dec_base
+  - hpt_cotrain_keypoints_base
 
 robomimic_model:
   ac_keys:
-    eva_bimanual: "actions_cartesian"
-    aria_bimanual: "actions_cartesian"
-  shared_ac_key: "actions_cartesian"
+    eva_bimanual: "actions_eva_cart_aria_keypoints"
+    aria_bimanual: "actions_eva_cart_aria_keypoints"
+  shared_ac_key: "actions_eva_cart_aria_keypoints"
 
   6dof: true
   diffusion: true
@@ -20,43 +20,42 @@ robomimic_model:
       pooling: null
       padding: "zero"
       time_dist: "beta"
-      infer_ac_dims:
-        eva_bimanual: 14
-        aria_bimanual: 14
       model:
         _target_: egomimic.models.denoising_nets.CrossTransformer
         nblocks: 6
         cond_dim: 256
-        hidden_dim: 128
-        act_dim: 14
-        act_seq: 100
+        hidden_dim: 256
+        act_dim: 128
+        act_seq: 12
         n_heads: 4
         dropout: 0.1
         mlp_layers: 4
         mlp_ratio: 4
-      latent_map:
+      embodiment_specs:
         eva_bimanual:
+          ac_dims: 14
           encoder:
-            _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalEncoder
+            _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder
             action_dim: 14
             hidden_dim: 128
             activation: "gelu"
             use_layernorm: false
           decoder:
-            _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalDecoder
+            _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder
             action_dim: 14
             hidden_dim: 128
             activation: "gelu"
             use_layernorm: true
-        aria_keypoints:
+        aria_bimanual:
+          ac_dims: 140
           encoder:
-            _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalEncoder
+            _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder
             action_dim: 140
             hidden_dim: 128
             activation: "gelu"
             use_layernorm: false
           decoder:
-            _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalDecoder
+            _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder
             action_dim: 140
             hidden_dim: 128
             activation: "gelu"
@@ -71,5 +70,5 @@ optimizer:
 scheduler:
   _target_: torch.optim.lr_scheduler.CosineAnnealingLR
   _partial_: true
-  T_max: 1400
+  T_max: 1800
   eta_min: 1e-5
diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml
new file mode 100644
index 00000000..c22bfdf5
--- /dev/null
+++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml
@@ -0,0 +1,74 @@
+defaults:
+  - hpt_cotrain_keypoints_base
+
+robomimic_model:
+  ac_keys:
+    eva_bimanual: "actions_eva_cart_aria_keypoints"
+    aria_bimanual: "actions_eva_cart_aria_keypoints"
+  shared_ac_key: "actions_eva_cart_aria_keypoints"
+
+  6dof: true
+  diffusion: true
+
+  head_specs:
+    aria_bimanual: null
+    eva_bimanual: null
+    shared:
+      _target_: egomimic.models.fm_policy.FMPolicy
+      action_horizon: 100
+      num_inference_steps: 50
+      pooling: null
+      padding: "zero"
+      time_dist: "beta"
+      model:
+        _target_: egomimic.models.denoising_nets.CrossTransformer
+        nblocks: 8
+        cond_dim: 256
+        hidden_dim: 256
+        act_dim: 256
+        act_seq: 32
+        n_heads: 4
+        dropout: 0.1
+        mlp_layers: 4
+        mlp_ratio: 4
+      embodiment_specs:
+        eva_bimanual:
+          ac_dims: 14
+          encoder:
+            _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder_32_256
+            action_dim: 14
+            hidden_dim: 256
+            activation: "gelu"
+            use_layernorm: false
+          decoder:
+            _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder_32_256
+            action_dim: 14
+            hidden_dim: 256
+            activation: "gelu"
+            use_layernorm: true
+        aria_bimanual:
+          ac_dims: 140
+          encoder:
+            _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256
+            action_dim: 140
+            hidden_dim: 256
+            activation: "gelu"
+            use_layernorm: false
+          decoder:
+            _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256
+            action_dim: 140
+            hidden_dim: 256
+            activation: "gelu"
+            use_layernorm: true
+
+optimizer:
+  _target_: torch.optim.AdamW
+  _partial_: true
+  lr: 1e-4
+  weight_decay: 0.0001
+
+scheduler:
+  _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+  _partial_: true
+  T_max: 1800
+  eta_min: 1e-5
diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml
new file mode 100644
index 00000000..305c8d62
--- /dev/null
+++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml
@@ -0,0 +1,70 @@
+defaults:
+  - hpt_cotrain_keypoints_base
+
+robomimic_model:
+  ac_keys:
+    eva_bimanual: "actions_eva_cart_aria_keypoints"
+    aria_bimanual: "actions_eva_cart_aria_keypoints"
+  shared_ac_key: "actions_eva_cart_aria_keypoints"
+
+  6dof: true
+  diffusion: true
+
+  head_specs:
+    aria_bimanual: null
+    eva_bimanual: null
+    shared:
+      _target_: egomimic.models.fm_policy.FMPolicy
+      action_horizon: 100
+      num_inference_steps: 50
+      pooling: null
+      padding: "zero"
+      time_dist: "beta"
+      model:
+        _target_: egomimic.models.denoising_nets.CrossTransformer
+        nblocks: 8
+        cond_dim: 256
+        hidden_dim: 256
+        act_dim: 256
+        act_seq: 100
+        n_heads: 4
+        dropout: 0.1
+        mlp_layers: 4
+        mlp_ratio: 4
+      embodiment_specs:
+        eva_bimanual:
+          ac_dims: 14
+          encoder:
+            _target_: egomimic.models.codec.mlp.MLPProjection
+            input_dim: 14
+            hidden_dim: 256
+            output_dim: 256
+          decoder:
+            _target_: egomimic.models.codec.mlp.MLPProjection
+            input_dim: 256
+            hidden_dim: 256
+            output_dim: 14
+        aria_bimanual:
+          ac_dims: 140
+          encoder:
+            _target_: egomimic.models.codec.mlp.MLPProjection
+            input_dim: 140
+            hidden_dim: 256
+            output_dim: 256
+          decoder:
+            _target_: egomimic.models.codec.mlp.MLPProjection
+            input_dim: 256
+            hidden_dim: 256
+            output_dim: 140
+
+optimizer:
+  _target_: torch.optim.AdamW
+  _partial_: true
+  lr: 1e-4
+  weight_decay: 0.0001
+
+scheduler:
+  _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+  _partial_: true
+  T_max: 1800
+  eta_min: 1e-5
diff --git a/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml
new file mode 100644
index 00000000..96e2d9ca
--- /dev/null
+++ b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml
@@ -0,0 +1,147 @@
+_target_: egomimic.pl_utils.pl_model.ModelWrapper
+robomimic_model:
+  _target_: egomimic.algo.hpt.HPT
+  data_schematic: _${data.dataset.data_schematic}
+  camera_transforms:
+    aria_bimanual:
+      _target_: egomimic.utils.egomimicUtils.CameraTransforms
+      intrinsics_key: "base" # change to base_half if using half res
+      extrinsics_key: "x5Dec13_2"
+    eva_bimanual:
+      _target_: egomimic.utils.egomimicUtils.CameraTransforms
+      intrinsics_key: "base" # change to base_half if using half res
+      extrinsics_key: "x5Dec13_2"
+  ac_keys:
+    aria_bimanual: "actions_eva_cart_aria_keypoints"
+    eva_bimanual: "actions_eva_cart_aria_keypoints"
+  shared_ac_key: "actions_eva_cart_aria_keypoints"
+
+  reverse_kl_samples: 8
+
+  trunk:
+    embed_dim: 256
+    num_blocks: 16
+    num_heads: 8
+    token_postprocessing: "action_token"
+    observation_horizon: 1
+    action_horizon: 64
+    no_trunk: false
+    use_domain_embedding: true
+    drop_path: 0.1
+    weight_init_style: "pytorch"
+
+  multitask: false
+  pretrained: false
+  pretrained_checkpoint: null
+  domains: ["eva_bimanual", "aria_bimanual"]
+  shared_obs_keys: ["front_img_1"]
+
+  shared_stem_specs:
+    front_img_1:
+      _target_: egomimic.models.hpt_nets.MLPPolicyStem
+      input_dim: 256
+      output_dim: 256
+      widths: [256]
+      specs:
+        random_horizon_masking: false
+        cross_attn:
+          crossattn_latent: 16
+          crossattn_heads: 8
+          crossattn_dim_head: 64
+          crossattn_modality_dropout: 0.1
+          modality_embed_dim: 256
+
+  stem_specs:
+    aria_bimanual:
+      state_keypoints: # TODO: check if this is added to dataschematic correctly
+        _target_: egomimic.models.hpt_nets.MLPPolicyStem
+        input_dim: 140
+        output_dim: 256
+        widths: [256]
+        specs:
+          random_horizon_masking: false
+          cross_attn:
+            crossattn_latent: 16
+            crossattn_heads: 8
+            crossattn_dim_head: 64
+            crossattn_modality_dropout: 0.1
+            modality_embed_dim: 256
+
+    eva_bimanual:
+      state_joint_positions:
+        _target_: egomimic.models.hpt_nets.MLPPolicyStem
+        input_dim: 14
+        output_dim: 256
+        widths: [256]
+        specs:
+          random_horizon_masking: false
+          cross_attn:
+            crossattn_latent: 16
+            crossattn_heads: 8
+            crossattn_dim_head: 64
+            crossattn_modality_dropout: 0.1
+            modality_embed_dim: 256
+      right_wrist_img:
+        _target_: egomimic.models.hpt_nets.MLPPolicyStem
+        input_dim: 256
+        output_dim: 256
+        widths: [256]
+        specs:
+          random_horizon_masking: false
+          cross_attn:
+            crossattn_latent: 16
+            crossattn_heads: 8
+            crossattn_dim_head: 64
+            crossattn_modality_dropout: 0.1
+            modality_embed_dim: 256
+      left_wrist_img:
+        _target_: egomimic.models.hpt_nets.MLPPolicyStem
+        input_dim: 256
+        output_dim: 256
+        widths: [256]
+        specs:
+          random_horizon_masking: false
+          cross_attn:
+            crossattn_latent: 16
+            crossattn_heads: 8
+            crossattn_dim_head: 64
+            crossattn_modality_dropout: 0.1
+            modality_embed_dim: 256
+
+  encoder_specs:
+    front_img_1:
+      _target_: egomimic.models.hpt_nets.ResNet
+      output_dim: 256
+      num_of_copy: 1
+    right_wrist_img:
+      _target_: egomimic.models.hpt_nets.ResNet
+      output_dim: 256
+      num_of_copy: 1
+    left_wrist_img:
+      _target_: egomimic.models.hpt_nets.ResNet
+      output_dim: 256
+      num_of_copy: 1
+
+  train_image_augs:
+    _target_: torchvision.transforms.Compose
+    transforms:
+      - _target_: torchvision.transforms.ColorJitter
+        brightness: 0.1
+        contrast: 0.1
+        saturation: 0.1
+        hue: 0.05
+      - _target_: torchvision.transforms.Normalize
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+  eval_image_augs:
+    _target_: torchvision.transforms.Compose
+    transforms:
+      - _target_: torchvision.transforms.Normalize
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+
+optimizer:
+  _target_: torch.optim.AdamW
+  _partial_: true
+  lr: 5e-5
+  weight_decay: 0.0001
diff --git a/egomimic/hydra_configs/train_zarr_latent.yaml b/egomimic/hydra_configs/train_zarr_latent.yaml
new file mode 100644
index 00000000..9280721a
--- /dev/null
+++ b/egomimic/hydra_configs/train_zarr_latent.yaml
@@ -0,0 +1,111 @@
+defaults:
+  - model: hpt_cotrain_flow_shared_head_latent_mlp
+  - visualization: eva_cartesian_aria_keypoints
+  - paths: default
+  - trainer: ddp
+  - debug: null
+  - logger: wandb
+  - data: eva_human_keypoints_cotrain
+  - callbacks: checkpoints
+  - override hydra/launcher: submitit
+  - _self_
+
+name: latent_flow
+description: latent_flow
+ckpt_path: null
+train: true
+eval: false
+
+eval_class:
+  _target_: egomimic.scripts.evaluation.Eve
+  mode: real
+  arm: both
+  eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}"
+
+hydra:
+  run:
+    # Dir should be experiment_name/description_{timestamp}
+    dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S}
+  sweep:
+    dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S}
+
+launch_params:
+  gpus_per_node: 1
+  nodes: 1
+
+data_schematic: # Dynamically fill in these shapes from the dataset
+  _target_: egomimic.rldb.zarr.utils.DataSchematic
+  norm_mode: quantile
+  schematic_dict:
+    eva_bimanual:
+      front_img_1: #batch key
+        key_type: camera_keys # key type
+        zarr_key: observations.images.front_img_1 # dataset key
+      right_wrist_img:
+        key_type: camera_keys
+        zarr_key: observations.images.right_wrist_img
+      left_wrist_img:
+        key_type: camera_keys
+        zarr_key: observations.images.left_wrist_img
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      joint_positions:
+        key_type: proprio_keys
+        zarr_key: observations.state.joint_positions
+      actions_joints:
+        key_type: action_keys
+        zarr_key: actions_joints
+      actions_eva_cart_aria_keypoints:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    aria_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      actions_eva_cart_aria_keypoints:
+        key_type: action_keys
+        zarr_key: actions_keypoints
+      keypoint_positions:
+        key_type: proprio_keys
+        zarr_key: observations.state.keypoints
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    mecka_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    scale_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+
+seed: 42
diff --git a/egomimic/hydra_configs/trainer/ddp.yaml b/egomimic/hydra_configs/trainer/ddp.yaml
index d3d90aca..d4359f17 100644
--- a/egomimic/hydra_configs/trainer/ddp.yaml
+++ b/egomimic/hydra_configs/trainer/ddp.yaml
@@ -1,11 +1,11 @@
 defaults:
   - default
 
-strategy: ddp
+strategy: ddp_find_unused_parameters_true
 
 accelerator: gpu
 devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}
 num_nodes: ${launch_params.nodes}
 sync_batchnorm: True
 check_val_every_n_epoch: 200
-num_sanity_val_steps: 0
\ No newline at end of file
+num_sanity_val_steps: 0
diff --git a/egomimic/hydra_configs/trainer/debug.yaml b/egomimic/hydra_configs/trainer/debug.yaml
index e3a9a1a5..905d3711 100644
--- a/egomimic/hydra_configs/trainer/debug.yaml
+++ b/egomimic/hydra_configs/trainer/debug.yaml
@@ -3,7 +3,7 @@ defaults:
 
 strategy: ddp_find_unused_parameters_true
 limit_train_batches: 5
-limit_val_batches: 20
+limit_val_batches: 3
 check_val_every_n_epoch: 2
 profiler: simple
 max_epochs: 4
diff --git a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml
index 8c4d1c91..33ae292c 100644
--- a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml
+++ b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml
@@ -1,14 +1,10 @@
 eva_bimanual:
-  action_keys: actions_cartesian
-  viz_function:
-    _target_: egomimic.rldb.embodiment.eva.Eva.viz
-    _partial_: true
-    mode: traj
-    intrinsics_key: base_half
+  _target_: egomimic.rldb.embodiment.eva.Eva.viz_cartesian_gt_preds
+  _partial_: true
+  image_key: front_img_1
+  action_key: actions_cartesian
 aria_bimanual:
-  action_keys: actions_cartesian
-  viz_function:
-    _target_: egomimic.rldb.embodiment.human.Aria.viz
-    _partial_: true
-    mode: keypoints
-    intrinsics_key: base_half
+  _target_: egomimic.rldb.embodiment.human.Human.viz_keypoints_gt_preds
+  _partial_: true
+  image_key: front_img_1
+  action_key: actions_keypoints
diff --git a/egomimic/models/codec/mlp.py b/egomimic/models/codec/mlp.py
new file mode 100644
index 00000000..43efe3a6
--- /dev/null
+++ b/egomimic/models/codec/mlp.py
@@ -0,0 +1,16 @@
+import torch
+import torch.nn as nn
+
+
+class MLPProjection(nn.Module):
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(input_dim, hidden_dim),
+            nn.GELU(),
+            nn.Linear(hidden_dim, output_dim),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x is in (B, T, D) -> (B, T, H)
+        return self.net(x)
diff --git a/egomimic/models/codec/temporal_enc_dec.py b/egomimic/models/codec/temporal_enc_dec.py
new file mode 100644
index 00000000..4547c567
--- /dev/null
+++ b/egomimic/models/codec/temporal_enc_dec.py
@@ -0,0 +1,477 @@
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+
+
+class SmallTemporalEncoder(nn.Module):
+    """
+    Fixed-architecture temporal encoder for 100-step action sequences.
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        activation: str = "gelu",
+        hidden_dim: int = 64,
+        use_layernorm: bool = True,
+    ):
+        super().__init__()
+        if activation == "relu":
+            self.act = nn.ReLU()
+        elif activation == "gelu":
+            self.act = nn.GELU()
+        elif activation == "silu":
+            self.act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+
+        layers = [
+            nn.Conv1d(action_dim, action_dim * 2, kernel_size=8, stride=2, padding=3),
+            self.act,
+            nn.Conv1d(
+                action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=2
+            ),
+            self.act,
+            nn.Conv1d(
+                action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=3
+            ),
+            self.act,
+        ]
+
+        self.down = nn.Sequential(*layers)
+        self.proj = nn.Linear(action_dim * 2, hidden_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Input:  (B, T, D) or (T, D)
+        Output: (B, K, H) or (K, H)
+        """
+        squeeze_B = False
+        if x.dim() == 2:
+            x = x.unsqueeze(0)
+            squeeze_B = True
+        elif x.dim() != 3:
+            raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}")
+
+        x = x.transpose(1, 2)  # (B, D, T)
+        x = self.down(x)  # (B, 2D, K)
+        x = x.transpose(1, 2)  # (B, K, 2D)
+        x = self.proj(x)  # (B, K, H)
+
+        return x.squeeze(0) if squeeze_B else x
+
+
+class SmallTemporalDecoder(nn.Module):
+    """
+    Decoder that mirrors SmallTemporalEncoder:
+        Enc convs (over time, channels-first):
+            (D -> 2D) k=8 s=2 p=3
+            (2D -> 2D) k=8 s=2 p=2
+            (2D -> 2D) k=8 s=2 p=3
+        For T=100 this encoder produces K=12.
+
+    This decoder maps:
+        Input:  (B, K=12, H=64) or (K, H)
+        Output: (B, T=100, D)   or (T, D)
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 64,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        K: int = 12,
+        T: int = 100,
+    ):
+        super().__init__()
+        self.action_dim = action_dim
+        self.hidden_dim = hidden_dim
+        self.T = T
+
+        if activation == "relu":
+            self.act = nn.ReLU()
+        elif activation == "gelu":
+            self.act = nn.GELU()
+        elif activation == "silu":
+            self.act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+
+        C2 = action_dim * 2
+
+        self.proj = nn.Linear(hidden_dim, C2)
+        self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity()
+
+        self.up = nn.Sequential(
+            nn.ConvTranspose1d(
+                C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0
+            ),
+            self.act,
+            nn.ConvTranspose1d(
+                C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0
+            ),
+            self.act,
+            nn.ConvTranspose1d(
+                C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0
+            ),
+        )
+
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        squeeze_B = False
+        if z.dim() == 2:
+            z = z.unsqueeze(0)
+            squeeze_B = True
+        elif z.dim() != 3:
+            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
+
+        B, K, H = z.shape
+        if H != self.hidden_dim:
+            raise ValueError(f"Expected H={self.hidden_dim}, got {H}")
+
+        x = self.norm(self.proj(z))  # (B, K, 2D)
+        x = x.transpose(1, 2)  # (B, 2D, K)
+        x = self.up(x)  # (B, D, T)
+        if x.shape[-1] != self.T:
+            raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}")
+        x = x.transpose(1, 2)  # (B, T, D)
+
+        return x.squeeze(0) if squeeze_B else x
+
+
+class LargeTemporalEncoder(nn.Module):
+    """
+    Encoder for (B, T=100, D) that keeps the channel count at D
+    and downsamples time: 100 -> 50 -> 24 -> 12.
+    Output: (B, K=12, H)
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 64,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        expect_T: int | None = 100,
+    ):
+        super().__init__()
+        if action_dim % 2 != 0:
+            raise ValueError(f"action_dim must be even to halve. Got {action_dim}")
+
+        self.action_dim = action_dim
+        self.hidden_dim = hidden_dim
+        self.expect_T = expect_T
+
+        if activation == "relu":
+            self.act = nn.ReLU()
+        elif activation == "gelu":
+            self.act = nn.GELU()
+        elif activation == "silu":
+            self.act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+
+        D = action_dim
+
+        self.down = nn.Sequential(
+            nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3),  # 100 -> 50
+            self.act,
+            nn.Conv1d(
+                action_dim, action_dim, kernel_size=8, stride=2, padding=2
+            ),  # 50 -> 24
+            self.act,
+            nn.Conv1d(
+                action_dim, action_dim, kernel_size=8, stride=2, padding=3
+            ),  # 24 -> 12
+            self.act,
+        )
+
+        self.proj = nn.Linear(action_dim, hidden_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        squeeze_B = False
+        if x.dim() == 2:
+            x = x.unsqueeze(0)
+            squeeze_B = True
+        elif x.dim() != 3:
+            raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}")
+
+        B, T, D = x.shape
+        if D != self.action_dim:
+            raise ValueError(f"Expected D={self.action_dim}, got {D}")
+        if self.expect_T is not None and T != self.expect_T:
+            raise ValueError(f"Expected T={self.expect_T}, got {T}")
+
+        x = x.transpose(1, 2)  # (B, D, T)
+        x = self.down(x)  # (B, D, K=12)
+        x = x.transpose(1, 2)  # (B, K, D)
+        x = self.proj(x)  # (B, K, H)
+        return x.squeeze(0) if squeeze_B else x
+
+
+class LargeTemporalDecoder(nn.Module):
+    """
+    Decoder that mirrors LargeTemporalEncoder:
+        time: 12 -> 24 -> 50 -> 100
+        channels: H -> D (held at D through all transposed convs)
+    Input:  (B, K=12, H) or (K, H)
+    Output: (B, T=100, D) or (T, D)
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 64,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        K: int = 12,
+        T: int = 100,
+    ):
+        super().__init__()
+        if action_dim % 2 != 0:
+            raise ValueError(f"action_dim must be even to halve. Got {action_dim}")
+
+        self.action_dim = action_dim
+        self.half_dim = action_dim // 2
+        self.hidden_dim = hidden_dim
+        self.T = T
+
+        if activation == "relu":
+            self.act = nn.ReLU()
+        elif activation == "gelu":
+            self.act = nn.GELU()
+        elif activation == "silu":
+            self.act = nn.SiLU()
+        else:
+            raise ValueError(f"Unknown activation: {activation}")
+
+        self.proj = nn.Linear(hidden_dim, action_dim)
+        self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity()
+
+        # Mirrors paddings/strides/kernels in reverse.
+        # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params.
+        self.up = nn.Sequential(
+            nn.ConvTranspose1d(
+                action_dim,
+                action_dim,
+                kernel_size=8,
+                stride=2,
+                padding=3,
+                output_padding=0,
+            ),
+            self.act,
+            nn.ConvTranspose1d(
+                action_dim,
+                action_dim,
+                kernel_size=8,
+                stride=2,
+                padding=2,
+                output_padding=0,
+            ),
+            self.act,
+            nn.ConvTranspose1d(
+                action_dim,
+                action_dim,
+                kernel_size=8,
+                stride=2,
+                padding=3,
+                output_padding=0,
+            ),
+        )
+
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        squeeze_B = False
+        if z.dim() == 2:
+            z = z.unsqueeze(0)
+            squeeze_B = True
+        elif z.dim() != 3:
+            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
+
+        B, K, H = z.shape
+        if H != self.hidden_dim:
+            raise ValueError(f"Expected H={self.hidden_dim}, got {H}")
+
+        x = self.norm(self.proj(z))  # (B, K, D)
+        x = x.transpose(1, 2)  # (B, D, K)
+        x = self.up(x)  # (B, D, T)
+        if x.shape[-1] != self.T:
+            raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}")
+        x = x.transpose(1, 2)  # (B, T, D)
+        return x.squeeze(0) if squeeze_B else x
+
+
+class SmallTemporalEncoder_32_256(SmallTemporalEncoder):
+    """
+    Fixed-architecture temporal encoder for 100-step action sequences.
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        activation: str = "gelu",
+        hidden_dim: int = 256,
+        use_layernorm: bool = True,
+    ):
+        super().__init__(
+            action_dim=action_dim,
+            hidden_dim=hidden_dim,
+            activation=activation,
+            use_layernorm=use_layernorm,
+        )
+
+        layers = [
+            nn.Conv1d(action_dim, 512, kernel_size=9, stride=3, padding=2),
+            self.act,
+        ]
+
+        self.down = nn.Sequential(*layers)
+        self.proj = nn.Linear(512, hidden_dim)
+
+
+class SmallTemporalDecoder_32_256(SmallTemporalDecoder):
+    """
+    Decoder that mirrors SmallTemporalEncoder_32_256:
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 256,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+    ):
+        super().__init__(
+            action_dim=action_dim,
+            hidden_dim=hidden_dim,
+            activation=activation,
+            use_layernorm=use_layernorm,
+        )
+
+        layers = [
+            nn.ConvTranspose1d(512, action_dim, kernel_size=9, stride=3, padding=1),
+            self.act,
+        ]
+
+        self.up = nn.Sequential(*layers)
+        self.proj = nn.Linear(hidden_dim, 512)
+        self.norm = nn.LayerNorm(512) if use_layernorm else nn.Identity()
+
+
+class LargeTemporalEncoder_32_256(LargeTemporalEncoder):
+    """
+    Encoder for (B, T=100, D) that expands channels: D -> 1024,
+    and downsamples time: 100 -> 32.
+    Output: (B, K=32, H)
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 256,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        expect_T: int | None = 100,
+    ):
+        super().__init__(
+            action_dim=action_dim,
+            hidden_dim=hidden_dim,
+            activation=activation,
+            use_layernorm=use_layernorm,
+            expect_T=expect_T,
+        )
+        layers = [
+            nn.Conv1d(action_dim, 1024, kernel_size=9, stride=3, padding=2),
+            self.act,
+        ]
+
+        self.down = nn.Sequential(*layers)
+        self.proj = nn.Linear(1024, hidden_dim)
+
+
+class LargeTemporalDecoder_32_256(LargeTemporalDecoder):
+    """
+    Decoder that mirrors LargeTemporalEncoder_32_256:
+    """
+
+    def __init__(
+        self,
+        *,
+        action_dim: int,
+        hidden_dim: int = 256,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+        K: int = 12,
+        T: int = 100,
+    ):
+        super().__init__(
+            action_dim=action_dim,
+            hidden_dim=hidden_dim,
+            activation=activation,
+            use_layernorm=use_layernorm,
+            K=K,
+            T=T,
+        )
+
+        layers = [
+            nn.ConvTranspose1d(1024, action_dim, kernel_size=9, stride=3, padding=1),
+            self.act,
+        ]
+
+        self.up = nn.Sequential(*layers)
+        self.proj = nn.Linear(hidden_dim, 1024)
+        self.norm = nn.LayerNorm(1024) if use_layernorm else nn.Identity()
+
+
+def count_params(module: nn.Module, trainable_only: bool = False) -> int:
+    if trainable_only:
+        return sum(p.numel() for p in module.parameters() if p.requires_grad)
+    return sum(p.numel() for p in module.parameters())
+
+
+def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None:
+    total = 0
+    for name, p in module.named_parameters():
+        if trainable_only and not p.requires_grad:
+            continue
+        n = p.numel()
+        total += n
+        print(f"{name:60s} {tuple(p.shape)!s:20s} {n}")
+    print(f"\nTOTAL params: {total}")
+
+
+if __name__ == "__main__":
+    B, T, D = 8, 100, 140
+
+    enc = LargeTemporalEncoder_32_256(action_dim=D)
+    dec = LargeTemporalDecoder_32_256(action_dim=D, use_layernorm=True)
+
+    x = torch.randn(B, T, D)
+    z = enc(x)
+    x_hat = dec(z)
+    # z shape is (8, 32, 256)
+    print("LargeTemporalEncoder_32_256")
+    print(count_params(enc))
+    print(count_params(enc, trainable_only=True))
+    print_param_breakdown(enc)
+
+    B, T, D = 8, 100, 14
+    enc = SmallTemporalEncoder_32_256(action_dim=D)
+    dec = SmallTemporalDecoder_32_256(action_dim=D, use_layernorm=True)
+
+    x = torch.randn(B, T, D)
+    z = enc(x)
+    x_hat = dec(z)
+    # z shape is (8, 32, 256)
+
+    print("SmallTemporalEncoder_32_256")
+    print(count_params(enc))
+    print(count_params(enc, trainable_only=True))
+    print_param_breakdown(enc)
diff --git a/egomimic/models/conv/temporal_enc_dec.py b/egomimic/models/conv/temporal_enc_dec.py
deleted file mode 100644
index 4d438c82..00000000
--- a/egomimic/models/conv/temporal_enc_dec.py
+++ /dev/null
@@ -1,305 +0,0 @@
-from __future__ import annotations
-
-from typing import List
-
-import torch
-import torch.nn as nn
-
-
-class SmallTemporalEncoder(nn.Module):
-    """
-    Fix temporal encoder for 100 seq of actiona
-    """
-    def __init__(
-        self,
-        *,
-        action_dim: int,
-        activation: str = "gelu",
-        use_layernorm: bool = True,
-    ):
-        super().__init__()
-        if activation == "relu":
-            act = nn.ReLU()
-        elif activation == "gelu":
-            act = nn.GELU()
-        elif activation == "silu":
-            act = nn.SiLU()
-        else:
-            raise ValueError(f"Unknown activation: {activation}")
-        
-        layers = [nn.Conv1d(action_dim, action_dim*2, kernel_size=8, stride=2, padding=3),
-                  act,
-                  nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=2),
-                  act,
-                  nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=3),
-                  act,
-                ]        
-
-
-        hidden_dim = 64
-        self.down = nn.Sequential(*layers)
-        self.proj = nn.Linear(action_dim*2, hidden_dim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Input:  (B, T, D) or (T, D)
-        Output: (B, K, H) or (K, H)
-        """
-        squeeze_B = False
-        if x.dim() == 2:
-            x = x.unsqueeze(0)
-            squeeze_B = True
-        elif x.dim() != 3:
-            raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}")
-
-        x = x.transpose(1, 2)          # (B, D, T)
-        x = self.down(x)               # (B, D, K)
-        x = x.transpose(1, 2)          # (B, K, D)
-        x = self.proj(x)    # (B, K, H)
-
-        return x.squeeze(0) if squeeze_B else x
-
-class SmallTemporalDecoder(nn.Module):
-    """
-    Decoder that mirrors SmallTemporalEncoder:
-        Enc convs (over time, channels-first):
-            (D -> 2D) k=8 s=2 p=3
-            (2D -> 2D) k=8 s=2 p=2
-            (2D -> 2D) k=8 s=2 p=3
-        For T=100 this encoder produces K=12.
-
-    This decoder maps:
-        Input:  (B, K=12, H=64) or (K, H)
-        Output: (B, T=100, D)   or (T, D)
-    """
-    def __init__(
-        self,
-        *,
-        action_dim: int,
-        hidden_dim: int = 64,
-        activation: str = "gelu",
-        use_layernorm: bool = True,
-        K: int = 12,
-        T: int = 100,
-    ):
-        super().__init__()
-        self.action_dim = action_dim
-        self.hidden_dim = hidden_dim
-        self.K = K
-        self.T = T
-
-        if activation == "relu":
-            act = nn.ReLU()
-        elif activation == "gelu":
-            act = nn.GELU()
-        elif activation == "silu":
-            act = nn.SiLU()
-        else:
-            raise ValueError(f"Unknown activation: {activation}")
-
-        C2 = action_dim * 2
-
-        self.proj = nn.Linear(hidden_dim, C2)
-        self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity()
-
-        self.up = nn.Sequential(
-            nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0),
-            act,
-            nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0),
-            act,
-            nn.ConvTranspose1d(C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0),
-        )
-
-    def forward(self, z: torch.Tensor) -> torch.Tensor:
-        squeeze_B = False
-        if z.dim() == 2:
-            z = z.unsqueeze(0)
-            squeeze_B = True
-        elif z.dim() != 3:
-            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
-
-        B, K, H = z.shape
-        if H != self.hidden_dim:
-            raise ValueError(f"Expected H={self.hidden_dim}, got {H}")
-        if K != self.K:
-            raise ValueError(f"Expected K={self.K}, got {K}")
-
-        x = self.norm(self.proj(z))     # (B, K, 2D)
-        x = x.transpose(1, 2)           # (B, 2D, K)
-        x = self.up(x)                  # (B, D, T)
-        if x.shape[-1] != self.T:
-            raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}")
-        x = x.transpose(1, 2)           # (B, T, D)
-
-        return x.squeeze(0) if squeeze_B else x
-
-class LargeTemporalEncoder(nn.Module):
-    """
-    Encoder for (B, T=100, D) that halves channels: D -> D/2,
-    and downsamples time: 100 -> 12.
-    Output: (B, K=12, H)
-    """
-    def __init__(
-        self,
-        *,
-        action_dim: int,
-        hidden_dim: int = 64,
-        activation: str = "gelu",
-        use_layernorm: bool = True,
-        expect_T: int | None = 100,
-    ):
-        super().__init__()
-        if action_dim % 2 != 0:
-            raise ValueError(f"action_dim must be even to halve. Got {action_dim}")
-
-        self.action_dim = action_dim
-        self.hidden_dim = hidden_dim
-        self.expect_T = expect_T
-
-        if activation == "relu":
-            act = nn.ReLU()
-        elif activation == "gelu":
-            act = nn.GELU()
-        elif activation == "silu":
-            act = nn.SiLU()
-        else:
-            raise ValueError(f"Unknown activation: {activation}")
-
-        D = action_dim
-
-        self.down = nn.Sequential(
-            nn.Conv1d(D,  action_dim, kernel_size=8, stride=2, padding=3),  # 100 -> 50
-            act,
-            nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2),  # 50 -> 24
-            act,
-            nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3),  # 24 -> 12
-            act,
-        )
-
-        self.proj = nn.Linear(action_dim, hidden_dim)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        squeeze_B = False
-        if x.dim() == 2:
-            x = x.unsqueeze(0)
-            squeeze_B = True
-        elif x.dim() != 3:
-            raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}")
-
-        B, T, D = x.shape
-        if D != self.action_dim:
-            raise ValueError(f"Expected D={self.action_dim}, got {D}")
-        if self.expect_T is not None and T != self.expect_T:
-            raise ValueError(f"Expected T={self.expect_T}, got {T}")
-
-        x = x.transpose(1, 2)           # (B, D, T)
-        x = self.down(x)                # (B, D/2, K=12)
-        x = x.transpose(1, 2)           # (B, K, D/2)
-        x = self.proj(x)     # (B, K, H)
-        return x.squeeze(0) if squeeze_B else x
-
-
-class LargeTemporalDecoder(nn.Module):
-    """
-    Decoder that mirrors LargeTemporalEncoder:
-        time: 12 -> 24 -> 50 -> 100
-        channels: H -> D/2 -> D
-    Input:  (B, K=12, H) or (K, H)
-    Output: (B, T=100, D) or (T, D)
-    """
-    def __init__(
-        self,
-        *,
-        action_dim: int,
-        hidden_dim: int = 64,
-        activation: str = "gelu",
-        use_layernorm: bool = True,
-        K: int = 12,
-        T: int = 100,
-    ):
-        super().__init__()
-        if action_dim % 2 != 0:
-            raise ValueError(f"action_dim must be even to halve. Got {action_dim}")
-
-        self.action_dim = action_dim
-        self.half_dim = action_dim // 2
-        self.hidden_dim = hidden_dim
-        self.K = K
-        self.T = T
-
-        if activation == "relu":
-            act = nn.ReLU()
-        elif activation == "gelu":
-            act = nn.GELU()
-        elif activation == "silu":
-            act = nn.SiLU()
-        else:
-            raise ValueError(f"Unknown activation: {activation}")
-
-        self.proj = nn.Linear(hidden_dim, action_dim)
-        self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity()
-
-        # Mirrors paddings/strides/kernels in reverse.
-        # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params.
-        self.up = nn.Sequential(
-            nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0),
-            act,
-            nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2, output_padding=0),
-            act,
-            nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0),
-        )
-
-    def forward(self, z: torch.Tensor) -> torch.Tensor:
-        squeeze_B = False
-        if z.dim() == 2:
-            z = z.unsqueeze(0)
-            squeeze_B = True
-        elif z.dim() != 3:
-            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
-
-        B, K, H = z.shape
-        if H != self.hidden_dim:
-            raise ValueError(f"Expected H={self.hidden_dim}, got {H}")
-        if K != self.K:
-            raise ValueError(f"Expected K={self.K}, got {K}")
-
-        x = self.norm(self.proj(z))     # (B, K, D/2)
-        x = x.transpose(1, 2)           # (B, D/2, K)
-        x = self.up(x)                  # (B, D, T)
-        if x.shape[-1] != self.T:
-            raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}")
-        x = x.transpose(1, 2)           # (B, T, D)
-        return x.squeeze(0) if squeeze_B else x
-
-
-def count_params(module: nn.Module, trainable_only: bool = False) -> int:
-    if trainable_only:
-        return sum(p.numel() for p in module.parameters() if p.requires_grad)
-    return sum(p.numel() for p in module.parameters())
-
-
-def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None:
-    total = 0
-    for name, p in module.named_parameters():
-        if trainable_only and not p.requires_grad:
-            continue
-        n = p.numel()
-        total += n
-        print(f"{name:60s} {tuple(p.shape)!s:20s} {n}")
-    print(f"\nTOTAL params: {total}")
-
-if __name__ == "__main__":
-    B, T, D = 8, 100, 140 
-
-    enc = LargeTemporalEncoder(action_dim=D)
-    dec = LargeTemporalDecoder(action_dim=D, use_layernorm=True)
-
-    x = torch.randn(B, T, D)
-    z = enc(x)
-    x_hat = dec(z)
-    
-    print(count_params(enc))
-    print(count_params(enc, trainable_only=True))
-    print_param_breakdown(enc)
-    
-    
\ No newline at end of file
diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py
index 25ccc641..5c5225b0 100644
--- a/egomimic/models/denoising_policy.py
+++ b/egomimic/models/denoising_policy.py
@@ -5,6 +5,7 @@
 import torch.nn.functional as F
 
 from egomimic.models.denoising_nets import ConditionalUnet1D
+from egomimic.rldb.embodiment.embodiment import get_embodiment
 
 
 class DenoisingPolicy(nn.Module):
@@ -23,29 +24,59 @@ def __init__(
         self,
         model: ConditionalUnet1D,
         action_horizon: int,
-        infer_ac_dims: dict,
         num_inference_steps: int = None,
+        embodiment_specs: dict = None,
         **kwargs,
     ):
         super().__init__()
 
         self.model = model
         self.action_horizon = action_horizon
-        self.infer_ac_dims = infer_ac_dims
         self.num_inference_steps = num_inference_steps
+        self.embodiment_specs = embodiment_specs
+        self.codec_enabled = False
+
+        _codecs = {}
+        if embodiment_specs is not None:
+            for _emb_name, _spec in embodiment_specs.items():
+                if _spec.get("encoder") is not None:
+                    _codecs[f"{_emb_name}_encoder"] = _spec["encoder"]
+                if _spec.get("decoder") is not None:
+                    _codecs[f"{_emb_name}_decoder"] = _spec["decoder"]
+        if _codecs:
+            self.codecs = nn.ModuleDict(_codecs)
 
         self.padding = kwargs.get("padding", None)
         self.pooling = kwargs.get("pooling", None)
-        self.model_type = kwargs.get("model_type", None)
-
-        if not infer_ac_dims:
-            raise ValueError("infer_ac_dims must be a non-empty dict")
 
         for name, param in self.model.named_parameters():
             if not param.requires_grad:
                 print(f"[warn] {name} has requires_grad=False")
 
         total_params = sum(p.numel() for p in self.model.parameters())
+        specs = self.embodiment_specs or {}
+        # Validate every embodiment up front so a missing ac_dims fails before
+        # any codec statistics are printed.
+        for embodiment_name, spec in specs.items():
+            if spec.get("ac_dims") is None:
+                raise ValueError(f"ac_dims must be specified for {embodiment_name}")
+        for embodiment_name, spec in specs.items():
+            # Start from zero for each embodiment: a spec may define only one
+            # half of the codec (or neither), and a count from the previous
+            # iteration must not leak into this one.
+            encoder_params = 0
+            decoder_params = 0
+            if spec.get("encoder") is not None:
+                encoder_params = sum(p.numel() for p in spec["encoder"].parameters())
+                self.codec_enabled = True
+            if spec.get("decoder") is not None:
+                decoder_params = sum(p.numel() for p in spec["decoder"].parameters())
+                self.codec_enabled = True
+            print(f"[{embodiment_name}] Encoder params: {encoder_params / 1e6:.2f}M")
+            print(f"[{embodiment_name}] Decoder params: {decoder_params / 1e6:.2f}M")
+            # Fold the codec weights into the reported total.
+            total_params += encoder_params + decoder_params
+
         print(
             f"[{self.__class__.__name__}] Total trainable parameters: {total_params / 1e6:.2f}M"
         )
@@ -60,7 +91,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None):
             (
                 len(global_cond),
                 self.action_horizon,
-                self.infer_ac_dims[embodiment_name],
+                self.embodiment_specs[embodiment_name].get("ac_dims"),
             ),
             dtype=global_cond.dtype,
             device=global_cond.device,
@@ -68,7 +99,9 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None):
         )
         return noise, global_cond
 
-    def inference(self, noise, global_cond, generator=None) -> torch.Tensor:  # pyright: ignore[reportUnusedParameter]
+    def inference(
+        self, noise, global_cond, embodiment_name, generator=None
+    ) -> torch.Tensor:  # pyright: ignore[reportUnusedParameter]
         """
         To be implemented in subclass: predict actions from noise and conditioning.
         """
@@ -78,13 +111,15 @@ def sample_action(self, global_cond, embodiment_name, generator=None):
         noise, global_cond = self.preprocess_sampling(
             global_cond, embodiment_name, generator
         )
-        return self.inference(noise, global_cond, generator, embodiment_name)
+        return self.inference(noise, global_cond, embodiment_name, generator)
 
-    def forward(self, global_cond, embodiment_name):
-        cond, embodiment = global_cond
-        return self.sample_action(cond, embodiment, embodiment_name)
+    def forward(self, global_cond):
+        cond, embodiment_name = global_cond
+        return self.sample_action(cond, embodiment_name)
 
-    def predict(self, actions, global_cond, embodiment_name) -> Tuple[torch.Tensor, torch.Tensor]:
+    def predict(
+        self, actions, global_cond, embodiment_name
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         To be implemented in subclass: returns (prediction, target) given action input and conditioning.
         """
@@ -121,7 +156,10 @@ def preprocess_compute_loss(self, global_cond, data, embodiment_name):
 
         return actions, global_cond
 
-    def compute_loss(self, global_cond, data, embodiment_name):
-        actions, global_cond = self.preprocess_compute_loss(global_cond, data, embodiment_name)
+    def compute_loss(self, global_cond, data):
+        embodiment_name = get_embodiment(data["embodiment"][0].item()).lower()
+        actions, global_cond = self.preprocess_compute_loss(
+            global_cond, data, embodiment_name
+        )
         pred, target = self.predict(actions, global_cond, embodiment_name)
         return self.loss_fn(pred, target)
diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py
index 551853a6..27e74ee5 100644
--- a/egomimic/models/fm_policy.py
+++ b/egomimic/models/fm_policy.py
@@ -24,34 +24,37 @@ def __init__(
         self,
         model: ConditionalUnet1D,
         action_horizon,
-        infer_ac_dims,
         num_inference_steps=None,
-        encoder_map=None,
+        embodiment_specs=None,
         **kwargs,
     ):
         super().__init__(
-            model, action_horizon, infer_ac_dims, num_inference_steps, **kwargs
+            model, action_horizon, num_inference_steps, embodiment_specs, **kwargs
         )
         self.time_dist = kwargs.get("time_dist", "beta")
-        self.encoder_map = encoder_map
+        self.dt = -1.0 / num_inference_steps if num_inference_steps else None
 
-    def step(self, x_t, t, global_cond):
+    def step(self, x_t, t, global_cond, embodiment_name):
         if len(t.shape) != 1:
             t = torch.tensor([t], device=global_cond.device)
-        v_t = self.model(x_t, t, global_cond)
+        v_t = self.denoising_model(x_t, t, global_cond, embodiment_name)
         return x_t + self.dt * v_t, t + self.dt
 
     @override
-    def inference(self, noise, global_cond, generator=None) -> torch.Tensor:
+    def inference(
+        self, noise, global_cond, embodiment_name, generator=None
+    ) -> torch.Tensor:
         self.dt = -1.0 / self.num_inference_steps
         x_t = noise
         time = torch.ones((len(global_cond)), device=global_cond.device)
         while time[0] >= -self.dt / 2:
-            x_t, time = self.step(x_t, time, global_cond)
+            x_t, time = self.step(x_t, time, global_cond, embodiment_name)
         return x_t
 
     @override
-    def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]:
+    def predict(
+        self, actions, global_cond, embodiment_name
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         noise = torch.randn(actions.shape, device=actions.device)
         batch_shape = (actions.shape[0],)
         if self.time_dist == "beta":
@@ -67,8 +70,45 @@ def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]:
         x_t = time_expanded * noise + (1 - time_expanded) * actions
         u_t = noise - actions
 
-        v_t = self.model(x_t, time, global_cond)
+        v_t = self.denoising_model(x_t, time, global_cond, embodiment_name)
 
         target = u_t
         pred = v_t
         return pred, target
+
+    def denoising_model(self, x_t, time, global_cond, embodiment_name):
+        """Predict v_t, routing x_t through the embodiment's optional codec."""
+        # codec_enabled is policy-wide, so this embodiment may still lack an
+        # encoder or decoder; a missing half acts as identity, not a crash.
+        spec = self.embodiment_specs[embodiment_name] if self.codec_enabled else {}
+        if spec.get("encoder") is not None:
+            x_t = spec["encoder"](x_t)
+        v_t = self.model(x_t, time, global_cond)
+        if spec.get("decoder") is not None:
+            v_t = spec["decoder"](v_t)
+        return v_t
+
+
+if __name__ == "__main__":
+    # Smoke test: build the shared head from the repo config and run both
+    # embodiments through step / inference / predict.
+    from pathlib import Path
+
+    import hydra
+    from omegaconf import OmegaConf
+
+    # Resolve the config relative to this file instead of a user-specific path.
+    cfg_path = (
+        Path(__file__).resolve().parents[1]
+        / "hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml"
+    )
+    cfg = OmegaConf.load(cfg_path)
+    model = hydra.utils.instantiate(cfg.robomimic_model.head_specs.shared)
+
+    global_cond = torch.randn(8, 64, 256)
+    for name, ac_dim in (("aria_bimanual", 140), ("eva_bimanual", 14)):
+        actions = torch.randn(8, 100, ac_dim)
+        x_next, _ = model.step(actions, torch.tensor([0.0]), global_cond, name)
+        sampled = model.inference(actions, global_cond, name)
+        pred, target = model.predict(actions, global_cond, name)
+        print(name, x_next.shape, sampled.shape, pred.shape, target.shape)
diff --git a/egomimic/rldb/embodiment/human.py b/egomimic/rldb/embodiment/human.py
index bab7432b..5c8e3da3 100644
--- a/egomimic/rldb/embodiment/human.py
+++ b/egomimic/rldb/embodiment/human.py
@@ -2,7 +2,9 @@
 
 from typing import Literal
 
-from egomimic.rldb.embodiment.embodiment import Embodiment
+import numpy as np
+
+from egomimic.rldb.embodiment.embodiment import Embodiment, get_embodiment
 from egomimic.rldb.zarr.action_chunk_transforms import (
     ActionChunkCoordinateFrameTransform,
     ConcatKeys,
@@ -15,6 +17,7 @@
 )
 from egomimic.utils.type_utils import _to_numpy
 from egomimic.utils.viz_utils import (
+    ColorPalette,
     _viz_axes,
     _viz_keypoints,
     _viz_traj,
@@ -43,6 +46,30 @@ def get_transform_list(
                 f"Unsupported mode '{mode}'. Expected one of: 'cartesian', 'keypoints'."
             )
 
+    @classmethod
+    def viz_keypoints_gt_preds(
+        cls, predictions, batch, image_key, action_key, **kwargs
+    ):
+        """Overlay ground-truth ("Reds") and predicted ("Greens") keypoints.
+
+        The embodiment name comes from the first sample in the batch and selects
+        predictions["<embodiment>_<action_key>"]. Returns the per-sample
+        renderings stacked along a new leading axis.
+        """
+        name = get_embodiment(batch["embodiment"][0].item()).lower()
+        frames, targets = batch[image_key], batch[action_key]
+        preds = predictions[f"{name}_{action_key}"]
+        frames, targets, preds = (_to_numpy(x) for x in (frames, targets, preds))
+
+        rendered = []
+        for idx, frame in enumerate(frames):
+            target, pred = targets[idx], preds[idx]
+            # Draw the target first, then the prediction on the same canvas.
+            canvas = cls.viz(frame, target, mode="keypoints", color="Reds", **kwargs)
+            canvas = cls.viz(canvas, pred, mode="keypoints", color="Greens", **kwargs)
+            rendered.append(canvas)
+        return np.stack(rendered, axis=0)
+
     @classmethod
     def viz_transformed_batch(
         cls,
@@ -50,6 +77,7 @@ def viz_transformed_batch(
         mode=Literal["traj", "axes", "keypoints"],
         action_key="actions_cartesian",
         image_key=None,
+        **kwargs,
     ):
         image_key = image_key or cls.VIZ_IMAGE_KEY
         action_key = action_key or "actions_cartesian"
@@ -59,7 +87,11 @@ def viz_transformed_batch(
         actions = _to_numpy(batch[action_key][0])
 
         return cls.viz(
-            images=images, actions=actions, mode=mode, intrinsics_key=intrinsics_key
+            images=images,
+            actions=actions,
+            mode=mode,
+            intrinsics_key=intrinsics_key,
+            **kwargs,
         )
 
     @classmethod
@@ -87,13 +119,25 @@ def viz(
                 **kwargs,
             )
         if mode == "keypoints":
+            color = kwargs.get("color", None)
+            if color is not None and ColorPalette.is_valid(color):
+                n = len(cls.FINGER_COLORS)
+                colors = {
+                    finger: ColorPalette.to_rgb(color, value=(i + 1) / (n + 1))
+                    for i, finger in enumerate(cls.FINGER_COLORS)
+                }
+                dot_color = ColorPalette.to_rgb(color, value=0.7)
+            else:
+                colors = cls.FINGER_COLORS
+                dot_color = cls.DOT_COLOR
             return _viz_keypoints(
                 images=images,
                 actions=actions,
                 intrinsics_key=intrinsics_key,
                 edges=cls.FINGER_EDGES,
-                colors=cls.FINGER_COLORS,
                 edge_ranges=cls.FINGER_EDGE_RANGES,
+                colors=colors,
+                dot_color=dot_color,
                 **kwargs,
             )
         raise ValueError(
@@ -227,6 +271,7 @@ class Aria(Human):
         ("ring", 9, 12),
         ("pinky", 12, 15),
     ]
+    DOT_COLOR = (255, 165, 0)
 
 
 class Scale(Human):
diff --git a/egomimic/trainHydra.py b/egomimic/trainHydra.py
index 7f2644c5..431d2f7d 100644
--- a/egomimic/trainHydra.py
+++ b/egomimic/trainHydra.py
@@ -1,6 +1,7 @@
 import copy
 import os
 import signal
+import subprocess
 from collections.abc import Mapping
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -114,7 +115,7 @@ def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         data_schematic.infer_norm_from_dataset(
             norm_dataset,
             dataset_name,
-            sample_frac=0.005,
+            sample_frac=0.0001,
             benchmark_dir=os.path.join(
                 cfg.trainer.default_root_dir, "benchmark_stats.json"
             ),
@@ -216,6 +217,9 @@ def main(cfg: DictConfig) -> Optional[float]:
     :param cfg: DictConfig configuration composed by Hydra.
     :return: Optional[float] with optimized metric value.
     """
+    script = os.path.join(os.path.dirname(__file__), "utils/aws/setup_secret.sh")
+    subprocess.run(["bash", script], check=True)
+
     # apply extra utilities
     # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.)
     extras(cfg)
diff --git a/egomimic/train_zarr.yaml b/egomimic/train_zarr.yaml
new file mode 100644
index 00000000..26f14b3b
--- /dev/null
+++ b/egomimic/train_zarr.yaml
@@ -0,0 +1,111 @@
+defaults:
+  - model: hpt_cotrain_flow_shared_head_latent
+  - visualization: eva_cartesian_aria_cartesian
+  - paths: default
+  - trainer: ddp
+  - debug: null
+  - logger: wandb
+  - data: eva_human_keypoints_cotrain
+  - callbacks: checkpoints
+  - override hydra/launcher: submitit
+  - _self_
+
+name: latent_flow
+description: latent_flow
+ckpt_path: null
+train: true
+eval: false
+
+eval_class:
+  _target_: egomimic.scripts.evaluation.Eve
+  mode: real
+  arm: both
+  eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}"
+
+hydra:
+  run:
+    # Dir should be experiment_name/description_{timestamp}
+    dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S}
+  sweep:
+    dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S}
+
+launch_params:
+  gpus_per_node: 1
+  nodes: 1
+
+data_schematic: # Dynamically fill in these shapes from the dataset
+  _target_: egomimic.rldb.zarr.utils.DataSchematic
+  norm_mode: quantile
+  schematic_dict:
+    eva_bimanual:
+      front_img_1:  # batch key
+        key_type: camera_keys  # key type
+        zarr_key: observations.images.front_img_1  # dataset (zarr) key
+      right_wrist_img:
+        key_type: camera_keys
+        zarr_key: observations.images.right_wrist_img
+      left_wrist_img:
+        key_type: camera_keys
+        zarr_key: observations.images.left_wrist_img
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      joint_positions:
+        key_type: proprio_keys
+        zarr_key: observations.state.joint_positions
+      actions_joints:
+        key_type: action_keys
+        zarr_key: actions_joints
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    aria_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      actions_keypoints:
+        key_type: action_keys
+        zarr_key: actions_keypoints
+      keypoint_positions:
+        key_type: proprio_keys
+        zarr_key: observations.state.keypoints
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    mecka_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    scale_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+
+seed: 42
diff --git a/egomimic/utils/viz_utils.py b/egomimic/utils/viz_utils.py
index bff862bd..d4289a74 100644
--- a/egomimic/utils/viz_utils.py
+++ b/egomimic/utils/viz_utils.py
@@ -1,4 +1,5 @@
 import cv2
+import matplotlib.pyplot as plt
 import numpy as np
 from scipy.spatial.transform import Rotation as R
 
@@ -22,6 +23,13 @@ class ColorPalette:
     def is_valid(cls, name: str) -> bool:
         return name in vars(cls).values()
 
+    @classmethod
+    def to_rgb(cls, cmap_name: str, value: float = 0.7) -> tuple[int, int, int]:
+        """Sample the matplotlib colormap `cmap_name` at `value` (0-1) and
+        return its RGB channels scaled to 0-255 integers (alpha dropped)."""
+        red, green, blue = plt.get_cmap(cmap_name)(value)[:3]
+        return (int(red * 255), int(green * 255), int(blue * 255))
+
 
 def _prepare_viz_image(img):
     if img.ndim == 3 and img.shape[0] in (1, 3):
@@ -161,7 +169,7 @@ def _draw_rotation_at_anchor(
 
 
 def _viz_keypoints(
-    images, actions, intrinsics_key, edges, colors, edge_ranges, **kwargs
+    images, actions, intrinsics_key, edges, edge_ranges, colors, dot_color, **kwargs
 ):
     """Visualize all 21 MANO keypoints per hand, projected onto the image."""
     # Prepare image
@@ -178,7 +186,7 @@ def _viz_keypoints(
     keypoints = {}
     keypoints["left"] = left_keypoints.reshape(-1, 3)
     keypoints["right"] = right_keypoints.reshape(-1, 3)
-    for hand, dot_color in [("left", (0, 120, 255)), ("right", (255, 80, 0))]:
+    for hand, dot_color in [("left", dot_color), ("right", dot_color)]:
         kps_cam = keypoints[hand]
         # Camera frame -> pixels
         kps_px = cam_frame_to_cam_pixels(kps_cam, intrinsics)  # (42, 3+) 21 per arm