GaTech-RL2 · fkryan · Feb 28, 2026
diff --git a/egomimic/scripts/aria_process/aria_helper.py b/egomimic/scripts/aria_process/aria_helper.py
@@ -53,7 +53,8 @@ def zarr_job(
     dataset_name: str,
     arm: str,
     description: str = "",
-) -> tuple[Path, Path] | None:
+    save_mp4: bool = True,
+) -> tuple[Path, Path | None] | None:
     """
     Convert one <vrs, vrs.json, mps_*> trio to a Zarr dataset.
     """
@@ -71,9 +72,9 @@ def zarr_job(
         nthreads=2,
         debug=False,
         benchmark=False,
-        save_mp4=True,
+        save_mp4=save_mp4,
         description=description,
         dataset_name=dataset_name
     )
 
-    return aria_zarr_main(args)
+    return aria_zarr_main(args)
diff --git a/egomimic/scripts/aria_process/aria_to_zarr.py b/egomimic/scripts/aria_process/aria_to_zarr.py
@@ -23,7 +23,6 @@
     compute_orientation_rotation_matrix,
     slam_to_rgb,
     undistort_to_linear,
-    cpf_to_rgb
 )
 from lerobot.common.datasets.lerobot_dataset import LEROBOT_HOME
 from projectaria_tools.core import data_provider, mps
@@ -518,13 +517,16 @@ def process_episode(episode_path, arm: str, low_res=False, benchmark=False):
         rgb_timestamps_ns = np.array(stream_timestamps_ns["rgb"])
 
         print(f"[DEBUG] LENGTH BEFORE CLEANING: {len(hand_cartesian_pose)}")
-        [hand_cartesian_pose, hand_keypoints_pose, head_pose], images, eye_gaze, rgb_timestamps_ns = (
-            AriaVRSExtractor.clean_data(
-                poses=[hand_cartesian_pose, hand_keypoints_pose, head_pose],
-                images=images,
-                eye_gaze=eye_gaze,
-                timestamps=rgb_timestamps_ns
-            )
+        (
+            [hand_cartesian_pose, hand_keypoints_pose, head_pose],
+            images,
+            eye_gaze,
+            rgb_timestamps_ns,
+        ) = AriaVRSExtractor.clean_data(
+            poses=[hand_cartesian_pose, hand_keypoints_pose, head_pose],
+            images=images,
+            eye_gaze=eye_gaze,
+            timestamps=rgb_timestamps_ns,
         )
         # actions, pose, images = AriaVRSExtractor.clean_data_projection(actions=actions, pose=pose, images=images, arm=arm)
         print(f"[DEBUG] LENGTH AFTER CLEANING: {len(hand_cartesian_pose)}")
@@ -1598,19 +1600,21 @@ def extract_episode(
             enable_sharding=False,
             task="",
         )
-        mp4_path = output_dir / f"{episode_name}.mp4"
-        W, H = 960, 720
-        p = start_ffmpeg_mp4(mp4_path, W, H, fps=30, pix_fmt="rgb24")
-        for video_images in AriaVRSExtractor.iter_images(
-            episode_path, chunk_length=256, height=H, width=W, focal_mult=3
-        ):
-            for image in video_images:
-                image = prep_frame(image, H, W)
-                if image is None:
-                    continue
-                p.stdin.write(image.tobytes())
-        p.stdin.close()
-        p.wait()
+        mp4_path = None
+        if self.save_mp4:
+            mp4_path = output_dir / f"{episode_name}.mp4"
+            W, H = 960, 720
+            p = start_ffmpeg_mp4(mp4_path, W, H, fps=30, pix_fmt="rgb24")
+            for video_images in AriaVRSExtractor.iter_images(
+                episode_path, chunk_length=256, height=H, width=W, focal_mult=3
+            ):
+                for image in video_images:
+                    image = prep_frame(image, H, W)
+                    if image is None:
+                        continue
+                    p.stdin.write(image.tobytes())
+            p.stdin.close()
+            p.wait()
         return zarr_path, mp4_path
 
     def extract_episodes(

diff --git a/egomimic/scripts/aria_process/aria_utils.py b/egomimic/scripts/aria_process/aria_utils.py
@@ -27,11 +27,23 @@ def build_camera_matrix(provider, pose_t):
     return T_world_rgb_camera
 
 
-def undistort_to_linear(provider, stream_ids, raw_image, camera_label="rgb", height=480, width=640, focal_mult=2):
+def undistort_to_linear(
+    provider,
+    stream_ids,
+    raw_image,
+    camera_label="rgb",
+    height=480,
+    width=640,
+    focal_mult=2,
+):
     camera_label = provider.get_label_from_stream_id(stream_ids[camera_label])
     calib = provider.get_device_calibration().get_camera_calib(camera_label)
     warped = calibration.get_linear_camera_calibration(
-        height, width, 133.25430222 * focal_mult, camera_label, calib.get_transform_device_camera()
+        height,
+        width,
+        133.25430222 * focal_mult,
+        camera_label,
+        calib.get_transform_device_camera(),
     )
     warped_image = calibration.distort_by_calibration(raw_image, warped, calib)
     warped_rot = np.rot90(warped_image, k=3)
@@ -106,6 +118,7 @@ def slam_to_rgb(provider):
 
     return transform
 
+
 def compute_orientation_rotation_matrix(palm_pose, wrist_pose, palm_normal):
     x_axis = wrist_pose - palm_pose
     x_axis = np.ravel(x_axis) / np.linalg.norm(x_axis)
@@ -119,6 +132,7 @@ def compute_orientation_rotation_matrix(palm_pose, wrist_pose, palm_normal):
     rot_matrix = np.column_stack([-1 * x_axis, y_axis, z_axis])
     return rot_matrix
 
+
 def coordinate_frame_to_ypr(x_axis, y_axis, z_axis):
     rot_matrix = np.column_stack([x_axis, y_axis, z_axis])
     rotation = R.from_matrix(rot_matrix)
@@ -127,6 +141,7 @@ def coordinate_frame_to_ypr(x_axis, y_axis, z_axis):
         euler_ypr = np.zeros_like(euler_ypr)
     return euler_ypr
 
+
 def cpf_to_rgb(provider):
     """
     Get cpf (eye tracking origin) to rgb camera transform (rotated upright)