GaTech-RL2 · SimarKareer · Feb 24, 2026 · Feb 9, 2026
diff --git a/.gitignore b/.gitignore
@@ -33,4 +33,7 @@ lerobot_test/
 **/lerobot_test/
 **/lerobot_test/**
 **/robot/models/**
-**/robot/models/
+**/robot/models/
+external/scale/scripts/datasets
+_turbojpeg_lib/
+external/scale/scripts/scale_data/
diff --git a/egomimic/algo/hpt.py b/egomimic/algo/hpt.py
@@ -971,14 +971,12 @@ def process_batch_for_training(self, batch):
         """
         processed_batch = {}
 
-        for embodiment_id, _batch in batch.items():
+        for embodiment_name, _batch in batch.items():
+            embodiment_id = get_embodiment_id(embodiment_name)
             processed_batch[embodiment_id] = {}
             for key, value in _batch.items():
-                key_name = self.data_schematic.lerobot_key_to_keyname(
-                    key, embodiment_id
-                )
-                if key_name is not None:
-                    processed_batch[embodiment_id][key_name] = value
+                if key is not None:
+                    processed_batch[embodiment_id][key] = value
 
             ac_key = self.ac_keys[embodiment_id]
             if len(processed_batch[embodiment_id][ac_key].shape) != 3:
@@ -992,6 +990,9 @@ def process_batch_for_training(self, batch):
             processed_batch[embodiment_id] = self.data_schematic.normalize_data(
                 processed_batch[embodiment_id], embodiment_id
             )
+            processed_batch[embodiment_id]["embodiment"] = torch.tensor(
+            [embodiment_id], device=self.device, dtype=torch.int64
+            )
 
         return processed_batch
 
@@ -1009,12 +1010,12 @@ def forward_training(self, batch):
         predictions = OrderedDict()
         hpt_batches = {}
         self.training_step += 1
-        for embodiment_id, _batch in batch.items():
+        for embodiment_id, _batch in batch.items():  # TODO: consider keying `batch` by embodiment_name instead of embodiment_id to keep things consistent
+            embodiment_name = get_embodiment(embodiment_id).lower()
             cam_keys = self.camera_keys[embodiment_id]
             proprio_keys = self.proprio_keys[embodiment_id]
             lang_keys = self.lang_keys[embodiment_id]
             ac_key = self.ac_keys[embodiment_id]
-            embodiment_name = get_embodiment(embodiment_id).lower()
             aux_ac_keys = self.auxiliary_ac_keys.get(embodiment_name, [])
             data = self._robomimic_to_hpt_data(
                 _batch, cam_keys, proprio_keys, lang_keys, ac_key, aux_ac_keys
@@ -1059,11 +1060,11 @@ def forward_eval(self, batch):
         """
         unnorm_preds = {}
         for embodiment_id, _batch in batch.items():
+            embodiment_name = get_embodiment(embodiment_id).lower()
             cam_keys = self.camera_keys[embodiment_id]
             proprio_keys = self.proprio_keys[embodiment_id]
             lang_keys = self.lang_keys[embodiment_id]
             ac_key = self.ac_keys[embodiment_id]
-            embodiment_name = get_embodiment(embodiment_id).lower()
             aux_ac_keys = self.auxiliary_ac_keys.get(embodiment_name, [])
             data = self._robomimic_to_hpt_data(
                 _batch, cam_keys, proprio_keys, lang_keys, ac_key, aux_ac_keys
@@ -1250,6 +1251,7 @@ def visualize_preds(self, predictions, batch):
         Returns:
             ims (np.ndarray): (B, H, W, 3) - images with actions drawn on top
         """
+
         embodiment_id = batch["embodiment"][0].item()
         embodiment_name = get_embodiment(embodiment_id).lower()
         ac_key = self.ac_keys[embodiment_id]

diff --git a/egomimic/hydra_configs/data/zarr_test.yaml b/egomimic/hydra_configs/data/zarr_test.yaml
@@ -0,0 +1,60 @@
+_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper
+
+train_datasets:
+  scale_bimanual:
+    _target_: egomimic.rldb.zarr.ZarrDataset
+    Episode_path: external/scale/scripts/datasets/2026-02-19-03-21-23-570038/697c1e6c0cac8cd3c4873844_episode_000000.zarr
+    key_map:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+        horizon: 100
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_ee_se3_world
+        horizon: 100
+      actions_keypoints:
+        key_type: action_keys
+        zarr_key: actions_keypoint_world
+        horizon: 100
+      actions_head_cartesian:
+        key_type: action_keys
+        zarr_key: actions_head_se3_world
+        horizon: 100
+valid_datasets:
+  scale_bimanual:
+    _target_: egomimic.rldb.zarr.ZarrDataset
+    Episode_path: external/scale/scripts/datasets/2026-02-19-03-21-23-570038/697c1e6c0cac8cd3c4873844_episode_000000.zarr
+    key_map:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+        horizon: 100
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_ee_se3_world
+        horizon: 100
+      actions_keypoints:
+        key_type: action_keys
+        zarr_key: actions_keypoint_world
+        horizon: 100
+      actions_head_cartesian:
+        key_type: action_keys
+        zarr_key: actions_head_se3_world
+        horizon: 100
+
+train_dataloader_params:
+  scale_bimanual:  # key must match the dataset name under train_datasets (presumably looked up by name like the valid params - confirm)
+    batch_size: 32
+    num_workers: 10
+
+valid_dataloader_params:
+  scale_bimanual:  # key must match the dataset name under valid_datasets; val_dataloader does params.get(dataset_name, {})
+    batch_size: 32
+    num_workers: 10
diff --git a/egomimic/hydra_configs/model/hpt_bc_flow_scale.yaml b/egomimic/hydra_configs/model/hpt_bc_flow_scale.yaml
@@ -0,0 +1,122 @@
+_target_: egomimic.pl_utils.pl_model.ModelWrapper
+robomimic_model:
+  _target_: egomimic.algo.hpt.HPT
+  data_schematic: _${data.dataset.data_schematic}
+  camera_transforms:
+    scale_bimanual:
+      _target_: egomimic.utils.egomimicUtils.CameraTransforms
+      intrinsics_key: "scale" # change to base_half if using half res
+      extrinsics_key: "scale"
+
+  diffusion: true 
+  6dof: true
+
+  ac_keys:
+    scale_bimanual: "actions_cartesian"
+
+  trunk:
+    embed_dim: 256
+    num_blocks: 64
+    num_heads: 8
+    token_postprocessing: "action_token"
+    observation_horizon: 1
+    action_horizon: 64
+    no_trunk: false
+    use_domain_embedding: true
+    drop_path: 0.1
+    weight_init_style: "pytorch"
+
+  multitask: false
+  pretrained: false
+  pretrained_checkpoint: "" # TODO
+  reverse_kl_samples: 8
+
+  domains: ["scale_bimanual"]
+  shared_obs_keys: ["front_img_1"]
+
+  shared_stem_specs:
+    front_img_1:
+      _target_: egomimic.models.hpt_nets.MLPPolicyStem
+      input_dim: 256
+      output_dim: 256
+      widths: [256]
+      specs:
+        random_horizon_masking: false
+        cross_attn:
+          crossattn_latent: 16
+          crossattn_heads: 8
+          crossattn_dim_head: 64
+          crossattn_modality_dropout: 0.1
+          modality_embed_dim: 256
+
+  stem_specs:
+    scale_bimanual:
+      state_ee_pose:
+        _target_: egomimic.models.hpt_nets.MLPPolicyStem
+        input_dim: 14
+        output_dim: 256
+        widths: [256]
+        specs:
+          random_horizon_masking: false
+          cross_attn:
+            crossattn_latent: 16
+            crossattn_heads: 8
+            crossattn_dim_head: 64
+            crossattn_modality_dropout: 0.1
+            modality_embed_dim: 256
+
+  head_specs:
+    scale_bimanual: 
+      _target_: egomimic.models.fm_policy.FMPolicy
+      action_horizon: 100
+      num_inference_steps: 50
+      pooling: null
+      time_dist: "beta"
+      infer_ac_dims:
+        scale_bimanual: 14
+      model:
+        _target_: egomimic.models.denoising_nets.CrossTransformer
+        nblocks: 6
+        cond_dim: 256
+        hidden_dim: 128
+        act_dim: 14
+        act_seq: 100
+        n_heads: 4
+        dropout: 0.1
+        mlp_layers: 4
+        mlp_ratio: 4
+
+  encoder_specs:
+    front_img_1:
+      _target_: egomimic.models.hpt_nets.ResNet
+      output_dim: 256
+
+  train_image_augs:
+    _target_: torchvision.transforms.Compose
+    transforms:
+      - _target_: torchvision.transforms.ColorJitter
+        brightness: 0.1
+        contrast: 0.1
+        saturation: 0.1
+        hue: 0.05
+      - _target_: torchvision.transforms.Normalize
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+  eval_image_augs:
+    _target_: torchvision.transforms.Compose
+    transforms:
+      - _target_: torchvision.transforms.Normalize
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+
+optimizer:
+  _target_: torch.optim.AdamW
+  _partial_: true
+  lr: 3e-4
+  weight_decay: 0.0001
+
+scheduler:
+  _target_: torch.optim.lr_scheduler.CosineAnnealingLR
+  _partial_: true
+  T_max: 1400
+  eta_min: 1e-5
diff --git a/egomimic/hydra_configs/train_zarr.yaml b/egomimic/hydra_configs/train_zarr.yaml
@@ -0,0 +1,33 @@
+defaults:
+  - model: hpt_bc_flow_aria  # NOTE(review): this change adds model/hpt_bc_flow_scale.yaml - confirm the aria model is the intended default here
+  - paths: default
+  - trainer: ddp
+  - debug: null
+  - logger: wandb
+  - data: test_multi_zarr  # NOTE(review): this change adds data/zarr_test.yaml - confirm test_multi_zarr exists and is the intended default
+  - callbacks: checkpoints
+  - override hydra/launcher: submitit
+  - _self_  # listed last so values in this file override the defaults above
+
+name: test
+description: test
+ckpt_path: null
+train: true
+eval: false
+
+eval_class:
+  _target_: egomimic.scripts.evaluation.Eve
+  mode: real
+  arm: both
+  eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}"
+
+hydra:
+  run:
+    # Dir should be experiment_name/description_{timestamp}
+    dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S}
+  sweep:
+    dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S}
+
+launch_params:
+  gpus_per_node: 1
+  nodes: 1
diff --git a/egomimic/pl_utils/pl_data_utils.py b/egomimic/pl_utils/pl_data_utils.py
@@ -94,7 +94,7 @@ def val_dataloader(self):
         iterables = dict()
         for dataset_name, dataset in self.valid_datasets.items():
             dataset_params = self.valid_dataloader_params.get(dataset_name, {})
-            iterables[dataset.embodiment] = DataLoader(
+            iterables[dataset_name] = DataLoader(
                 dataset,
                 shuffle=False,
                 collate_fn=self.collate_fn,

diff --git a/egomimic/rldb/.gitignore b/egomimic/rldb/.gitignore
@@ -2,4 +2,6 @@ benchmark_files/
 *.egg-info
 *.parquet
 *.pyc
-*.hdf5
+*.hdf5
+*/lerobot
+*/zarr
diff --git a/egomimic/rldb/utils.py b/egomimic/rldb/utils.py
@@ -94,6 +94,9 @@ class EMBODIMENT(Enum):
     MECKA_BIMANUAL = 9
     MECKA_RIGHT_ARM = 10
     MECKA_LEFT_ARM = 11
+    SCALE_BIMANUAL = 12
+    SCALE_RIGHT_ARM = 13
+    SCALE_LEFT_ARM = 14
 
 
 SEED = 42

diff --git a/egomimic/rldb/zarr/__init__.py b/egomimic/rldb/zarr/__init__.py
@@ -7,11 +7,17 @@
     MultiDataset,
     ZarrDataset,
     ZarrEpisode,
+    LocalEpisodeResolver,
+    S3EpisodeResolver,
 )
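+# from egomimic.rldb.zarr.zarr_writer import ZarrWriter  # NOTE(review): import is disabled, yet "ZarrWriter" is listed in __all__ below, so `from egomimic.rldb.zarr import *` will raise AttributeError - restore this import or drop the export.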
 
 __all__ = [
     "EpisodeResolver",
     "MultiDataset",
     "ZarrDataset",
     "ZarrEpisode",
+    "ZarrWriter",
+    "LocalEpisodeResolver",
+    "S3EpisodeResolver",
 ]
diff --git a/egomimic/rldb/zarr/action_chunk_transforms.py b/egomimic/rldb/zarr/action_chunk_transforms.py
@@ -25,7 +25,7 @@
     _matrix_to_xyzypr,
     _xyzwxyz_to_matrix,
 )
-
+from egomimic.utils.egomimicUtils import EXTRINSICS
 # ---------------------------------------------------------------------------
 # Base Transform
 # ---------------------------------------------------------------------------
@@ -436,12 +436,16 @@ def build_eva_bimanual_transform_list(
     obs_key: str = "observations.state.ee_pose",
     chunk_length: int = 100,
     stride: int = 1,
+    extrinsics_key: str = "x5Dec13_2",
     is_quat: bool = True,
-    left_extra_batch_key: dict | None = None,
-    right_extra_batch_key: dict | None = None,
 ) -> list[Transform]:
     """Canonical EVA bimanual transform pipeline used by tests and notebooks."""
-    transform_list: list[Transform] = [
+    extrinsics = EXTRINSICS[extrinsics_key]
+    left_extrinsics_pose = _matrix_to_xyzwxyz(extrinsics["left"][None, :])[0]
+    right_extrinsics_pose = _matrix_to_xyzwxyz(extrinsics["right"][None, :])[0]
+    left_extra_batch_key = {"left_extrinsics_pose": left_extrinsics_pose}
+    right_extra_batch_key = {"right_extrinsics_pose": right_extrinsics_pose}
+    transform_list = [
         ActionChunkCoordinateFrameTransform(
             target_world=left_target_world,
             chunk_world=left_cmd_world,
@@ -495,6 +499,7 @@ def build_eva_bimanual_transform_list(
             stride=stride,
         ),
     ]
+
     if is_quat:
         transform_list.append(
             XYZWXYZ_to_XYZYPR(
@@ -644,4 +649,4 @@ def build_aria_bimanual_transform_list(
             DeleteKeys(keys_to_delete=keys_to_delete),
         ]
     )
-    return transform_list
+    return transform_list