diff --git a/egomimic/hydra_configs/data/scale_zarr_test.yaml b/egomimic/hydra_configs/data/scale_zarr_test.yaml new file mode 100644 index 00000000..80784f19 --- /dev/null +++ b/egomimic/hydra_configs/data/scale_zarr_test.yaml @@ -0,0 +1,71 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + scale_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.LocalEpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/scale/2026-02-24-01-49-24-166324 + key_map: + observations.images.front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: images.front_1 + right.action_ee_pose: + key_type: action_keys + zarr_key: right.obs_ee_pose + horizon: 30 + left.action_ee_pose: + key_type: action_keys + zarr_key: left.obs_ee_pose + horizon: 30 + right.obs_ee_pose: + key_type: proprio_keys + zarr_key: right.obs_ee_pose + left.obs_ee_pose: + key_type: proprio_keys + zarr_key: left.obs_ee_pose + obs_head_pose: + zarr_key: obs_head_pose + key_type: proprio_keys + transform_list: + _target_: egomimic.rldb.zarr.action_chunk_transforms.build_aria_bimanual_transform_list + stride: 1 + mode: train +valid_datasets: + scale_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.LocalEpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/scale/2026-02-24-01-49-24-166324 + key_map: + observations.images.front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: images.front_1 + right.action_ee_pose: + key_type: action_keys + zarr_key: right.obs_ee_pose + horizon: 30 + left.action_ee_pose: + key_type: action_keys + zarr_key: left.obs_ee_pose + horizon: 30 + right.obs_ee_pose: + key_type: proprio_keys + zarr_key: right.obs_ee_pose + left.obs_ee_pose: + key_type: proprio_keys + zarr_key: left.obs_ee_pose + 
obs_head_pose: + zarr_key: obs_head_pose + key_type: proprio_keys + transform_list: + _target_: egomimic.rldb.zarr.action_chunk_transforms.build_aria_bimanual_transform_list + stride: 1 + mode: valid +train_dataloader_params: + scale_bimanual: + batch_size: 32 + num_workers: 10 +valid_dataloader_params: + scale_bimanual: + batch_size: 32 + num_workers: 10 diff --git a/egomimic/hydra_configs/model/hpt_bc_flow_scale.yaml b/egomimic/hydra_configs/model/hpt_bc_flow_scale.yaml index 2c9d66ef..d4d84f76 100644 --- a/egomimic/hydra_configs/model/hpt_bc_flow_scale.yaml +++ b/egomimic/hydra_configs/model/hpt_bc_flow_scale.yaml @@ -53,7 +53,7 @@ robomimic_model: scale_bimanual: state_ee_pose: _target_: egomimic.models.hpt_nets.MLPPolicyStem - input_dim: 14 + input_dim: 12 output_dim: 256 widths: [256] specs: @@ -73,13 +73,13 @@ robomimic_model: pooling: null time_dist: "beta" infer_ac_dims: - scale_bimanual: 14 + scale_bimanual: 12 model: _target_: egomimic.models.denoising_nets.CrossTransformer nblocks: 6 cond_dim: 256 hidden_dim: 128 - act_dim: 14 + act_dim: 12 act_seq: 100 n_heads: 4 dropout: 0.1 diff --git a/egomimic/hydra_configs/train_zarr.yaml b/egomimic/hydra_configs/train_zarr.yaml index 06c530a1..53afea6f 100644 --- a/egomimic/hydra_configs/train_zarr.yaml +++ b/egomimic/hydra_configs/train_zarr.yaml @@ -4,7 +4,7 @@ defaults: - trainer: debug - debug: null - logger: debug - - data: eva_bc_zarr.yaml + - data: eva_bc_zarr - callbacks: checkpoints - override hydra/launcher: submitit - _self_ @@ -115,4 +115,4 @@ data_schematic: # Dynamically fill in these shapes from the dataset mecka_bimanual: front_img_1 scale_bimanual: - front_img_1 \ No newline at end of file + front_img_1 diff --git a/egomimic/scripts/zarr_data_viz.ipynb b/egomimic/scripts/zarr_data_viz.ipynb index 822580fb..c3b0b085 100644 --- a/egomimic/scripts/zarr_data_viz.ipynb +++ b/egomimic/scripts/zarr_data_viz.ipynb @@ -328,8 +328,8 @@ "# EPISODE_PATH = 
Path(\"/coc/flash7/scratch/egoverseDebugDatasets/scale/697a9070da7b91acaf3f2d88_episode_000000.zarr\") # Scale\n", "# intrinsics_key = \"scale\"\n", "\n", - "EPISODE_PATH = Path(\"/coc/flash7/scratch/egoverseDebugDatasets/1767671007927.zarr/\") # Aria\n", - "intrinsics_key = \"base\"\n", + "EPISODE_PATH = Path(\"/coc/flash7/scratch/egoverseDebugDatasets/scale/2026-02-24-01-49-24-166324/697a9070da7b91acaf3f2d88_episode_000002.zarr\") # Aria\n", + "intrinsics_key = \"scale\"\n", "\n", "\n", "key_map = {\n", @@ -350,6 +350,47 @@ ")\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b4c03ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Full MultiDataset via LocalEpisodeResolver (mirrors test_multi_zarr.yaml)\n", + "from egomimic.rldb.zarr.zarr_dataset_multi import LocalEpisodeResolver, MultiDataset\n", + "from egomimic.rldb.zarr.action_chunk_transforms import build_aria_bimanual_transform_list\n", + "\n", + "SCALE_FOLDER = Path(\"/coc/flash7/scratch/egoverseDebugDatasets/scale/2026-02-24-01-49-24-166324\")\n", + "\n", + "key_map = {\n", + " \"images.front_1\": {\"zarr_key\": \"images.front_1\"},\n", + " \"right.obs_ee_pose\": {\"zarr_key\": \"right.obs_ee_pose\"},\n", + " \"left.obs_ee_pose\": {\"zarr_key\": \"left.obs_ee_pose\"},\n", + " \"right.action_ee_pose\": {\"zarr_key\": \"right.obs_ee_pose\", \"horizon\": 30},\n", + " \"left.action_ee_pose\": {\"zarr_key\": \"left.obs_ee_pose\", \"horizon\": 30},\n", + " \"obs_head_pose\": {\"zarr_key\": \"obs_head_pose\"},\n", + "}\n", + "\n", + "transform_list = build_aria_bimanual_transform_list(\n", + " stride=1,\n", + ")\n", + "\n", + "resolver = LocalEpisodeResolver(\n", + " folder_path=SCALE_FOLDER,\n", + " key_map=key_map,\n", + " transform_list=transform_list,\n", + ")\n", + "\n", + "multi_ds = MultiDataset._from_resolver(resolver, mode=\"total\")\n", + "print(f\"MultiDataset total frames: {len(multi_ds)}\")\n", + "print(f\"Underlying episodes: {list(multi_ds.datasets.keys())}\")\n", + 
"\n", + "loader = torch.utils.data.DataLoader(multi_ds, batch_size=1, shuffle=False)\n", + "batch = next(iter(loader))\n", + "print(\"Batch keys:\", list(batch.keys()))" + ] + }, { "cell_type": "code", "execution_count": null, @@ -379,7 +420,61 @@ "outputs": [], "source": [ "batch = next(iter(loader))\n", - "nds(batch)" + "nds(batch)\n", + "print(\"Batch keys:\", list(batch.keys()))\n", + "print(batch[\"right.action_ee_pose\"][0, 0])\n", + "print(batch[\"left.action_ee_pose\"][0, 0])\n", + "print(batch[\"obs_head_pose\"][0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e94799c", + "metadata": {}, + "outputs": [], + "source": [ + "left_hand_pose = []\n", + "right_hand_pose = []\n", + "head_pose = []\n", + "for i, batch in enumerate(loader):\n", + " left_hand_pose.append(batch[\"left.action_ee_pose\"][0, 0])\n", + " right_hand_pose.append(batch[\"right.action_ee_pose\"][0, 0])\n", + " head_pose.append(batch[\"obs_head_pose\"][0])\n", + " \n", + " if i > 400:\n", + " break\n", + "left_hand_pose = np.array(left_hand_pose)\n", + "right_hand_pose = np.array(right_hand_pose)\n", + "head_pose = np.array(head_pose)\n", + "\n", + "# chunk the pose to actions(N, 100, 3)\n", + "left_hand_pose_actions = []\n", + "right_hand_pose_actions = []\n", + "head_pose_actions = []\n", + "for i in range(left_hand_pose.shape[0] - 100):\n", + " action_left_hand = left_hand_pose[i:i+100, :]\n", + " action_right_hand = right_hand_pose[i:i+100, :]\n", + " action_head = head_pose[i:i+100, :]\n", + " left_hand_pose_actions.append(action_left_hand)\n", + " right_hand_pose_actions.append(action_right_hand)\n", + " head_pose_actions.append(action_head)\n", + "left_hand_pose_actions = np.array(left_hand_pose_actions)\n", + "right_hand_pose_actions = np.array(right_hand_pose_actions)\n", + "head_pose_actions = np.array(head_pose_actions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6517f061", + "metadata": {}, + "outputs": [], + "source": [ + 
"from egomimic.utils.egomimicUtils import render_3d_traj_frames\n", + "\n", + "frames = render_3d_traj_frames([left_hand_pose_actions, right_hand_pose_actions, head_pose_actions], labels=[\"left hand\", \"right hand\", \"head\"], stride=10)\n", + "mpy.show_video(frames, fps=30)\n" ] }, { diff --git a/external/scale/scripts/sfs_to_egoverse_zarr.py b/external/scale/scripts/sfs_to_egoverse_zarr.py index 113200e6..057819f1 100644 --- a/external/scale/scripts/sfs_to_egoverse_zarr.py +++ b/external/scale/scripts/sfs_to_egoverse_zarr.py @@ -3,13 +3,13 @@ Scale SFS -> EgoVerse Zarr converter. Output keys per episode: - left.obs_ee_pose (T, 6) xyzypr - right.obs_ee_pose (T, 6) xyzypr + left.obs_ee_pose (T, 7) xyz + quat(w, x, y, z) + right.obs_ee_pose (T, 7) xyz + quat(w, x, y, z) left.obs_keypoints (T, 63) 21 keypoints * 3 (xyz) right.obs_keypoints (T, 63) 21 keypoints * 3 (xyz) - left.obs_wrist_pose (T, 6) xyzypr - right.obs_wrist_pose (T, 6) xyzypr - obs_head_pose (T, 6) xyzypr + left.obs_wrist_pose (T, 7) xyz + quat(w, x, y, z) + right.obs_wrist_pose (T, 7) xyz + quat(w, x, y, z) + obs_head_pose (T, 7) xyz + quat(w, x, y, z) images.front_1 (T, H, W, 3) JPEG-compressed by ZarrWriter Usage: @@ -63,6 +63,21 @@ +def _batch_euler_to_quat(euler_zyx: np.ndarray) -> np.ndarray: + """(N, 3) euler ZYX -> (N, 4) quaternion wxyz.""" + q_xyzw = R.from_euler("ZYX", euler_zyx, degrees=False).as_quat() # scipy: xyzw + return q_xyzw[..., [3, 0, 1, 2]].astype(np.float32) # reorder -> wxyz + +def _batch_pose6_to_pose7(pose6: np.ndarray) -> np.ndarray: + """(N, 6) [xyz ypr] -> (N, 7) [xyz quat_wxyz]. 
Invalid sentinels → zeros.""" + N = pose6.shape[0] + out = np.zeros((N, 7), dtype=np.float32) + valid = ~np.any(pose6 >= INVALID_VALUE - 1, axis=1) + if valid.any(): + out[valid, :3] = pose6[valid, :3] + out[valid, 3:] = _batch_euler_to_quat(pose6[valid, 3:6]) + return out + # --------------------------------------------------------------------------- # Data structures & extraction (unchanged from original) @@ -245,17 +260,16 @@ def extract_all_frames_metadata(self) -> list[FrameData]: ) return frames - def load_images_for_range(self, start_idx: int, end_idx: int) -> list[np.ndarray | None]: + def load_all_images(self) -> list[np.ndarray | None]: + """Read every frame of the video sequentially (no seeking). Index i == video frame i.""" cap = cv2.VideoCapture(self.video_path) if not cap.isOpened(): - return [None] * (end_idx - start_idx) - cap.set(cv2.CAP_PROP_POS_FRAMES, start_idx) + return [] images: list[np.ndarray | None] = [] - for _ in range(end_idx - start_idx): + while True: ret, frame = cap.read() if not ret: - images.append(None) - continue + break images.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) cap.release() return images @@ -273,21 +287,25 @@ def _compute_palm_centroid(keypoints: np.ndarray) -> np.ndarray: return np.mean(palm_kps[valid_mask], axis=0).astype(np.float32) -def _compute_palm_orientation(keypoints: np.ndarray, mirror_y: bool = False) -> np.ndarray: +def _compute_palm_orientation(keypoints: np.ndarray, flip_x: bool = False) -> np.ndarray: + """Hand frame: x=right, y=down (palm normal toward ground), z=forward (toward fingers). + flip_x=True for the right hand so that x is rightward for both hands. 
+ """ wrist, index1, middle1, pinky1 = keypoints[0], keypoints[5], keypoints[9], keypoints[17] - if any(np.any(kp >= INVALID_VALUE - 1) for kp in (wrist, index1, pinky1)): + if any(np.any(kp >= INVALID_VALUE - 1) for kp in (wrist, index1, middle1, pinky1)): return np.zeros(3, dtype=np.float32) - x_axis = middle1 - wrist - x_axis /= np.linalg.norm(x_axis) + 1e-8 - temp_y = pinky1 - wrist - z_axis = np.cross(x_axis, temp_y) + # z: forward — from wrist toward middle finger + z_axis = middle1 - wrist z_axis /= np.linalg.norm(z_axis) + 1e-8 + # x: right — across palm, orthogonalized against z + # left hand: index1 - pinky1 is rightward + # right hand: pinky1 - index1 is rightward (flip_x=True) + across = (pinky1 - index1) if flip_x else (index1 - pinky1) + across -= np.dot(across, z_axis) * z_axis + x_axis = across / (np.linalg.norm(across) + 1e-8) + # y: down (palm normal toward ground) = cross(z, x) y_axis = np.cross(z_axis, x_axis) y_axis /= np.linalg.norm(y_axis) + 1e-8 - if mirror_y: - y_axis = -y_axis - z_axis = np.cross(x_axis, y_axis) - z_axis /= np.linalg.norm(z_axis) + 1e-8 rot = np.column_stack([x_axis, y_axis, z_axis]) try: return R.from_matrix(rot).as_euler("ZYX", degrees=False).astype(np.float32) @@ -295,11 +313,11 @@ def _compute_palm_orientation(keypoints: np.ndarray, mirror_y: bool = False) -> return np.zeros(3, dtype=np.float32) -def _compute_palm_6dof(keypoints: np.ndarray, mirror_y: bool = False) -> np.ndarray: +def _compute_palm_6dof(keypoints: np.ndarray, flip_x: bool = False) -> np.ndarray: centroid = _compute_palm_centroid(keypoints) if np.any(centroid >= INVALID_VALUE - 1): return np.full(6, INVALID_VALUE, dtype=np.float32) - ypr = _compute_palm_orientation(keypoints, mirror_y=mirror_y) + ypr = _compute_palm_orientation(keypoints, flip_x=flip_x) return np.concatenate([centroid, ypr]).astype(np.float32) @@ -310,22 +328,25 @@ def _compute_wrist_position(keypoints: np.ndarray) -> np.ndarray: return wrist.astype(np.float32) -def 
_compute_wrist_orientation(keypoints: np.ndarray, mirror_y: bool = False) -> np.ndarray: +def _compute_wrist_orientation(keypoints: np.ndarray, flip_x: bool = False) -> np.ndarray: + """Hand frame: x=right, y=down (palm normal toward ground), z=forward (toward fingers). + flip_x=True for the right hand so that x is rightward for both hands. + """ wrist, index1, middle1, pinky1 = keypoints[0], keypoints[5], keypoints[9], keypoints[17] - if any(np.any(kp >= INVALID_VALUE - 1) for kp in (wrist, index1, pinky1)): + if any(np.any(kp >= INVALID_VALUE - 1) for kp in (wrist, index1, middle1, pinky1)): return np.zeros(3, dtype=np.float32) - - x_axis = middle1 - wrist - x_axis /= np.linalg.norm(x_axis) + 1e-8 - temp_y = pinky1 - wrist - z_axis = np.cross(x_axis, temp_y) + # z: forward — from wrist toward middle finger + z_axis = middle1 - wrist z_axis /= np.linalg.norm(z_axis) + 1e-8 + # x: right — across palm, orthogonalized against z + # left hand: index1 - pinky1 is rightward + # right hand: pinky1 - index1 is rightward (flip_x=True) + across = (pinky1 - index1) if flip_x else (index1 - pinky1) + across -= np.dot(across, z_axis) * z_axis + x_axis = across / (np.linalg.norm(across) + 1e-8) + # y: down (palm normal toward ground) = cross(z, x) y_axis = np.cross(z_axis, x_axis) y_axis /= np.linalg.norm(y_axis) + 1e-8 - if mirror_y: - y_axis = -y_axis - z_axis = np.cross(x_axis, y_axis) - z_axis /= np.linalg.norm(z_axis) + 1e-8 rot = np.column_stack([x_axis, y_axis, z_axis]) try: return R.from_matrix(rot).as_euler("ZYX", degrees=False).astype(np.float32) @@ -333,11 +354,11 @@ def _compute_wrist_orientation(keypoints: np.ndarray, mirror_y: bool = False) -> return np.zeros(3, dtype=np.float32) -def _compute_wrist_6dof(keypoints: np.ndarray, mirror_y: bool = False) -> np.ndarray: +def _compute_wrist_6dof(keypoints: np.ndarray, flip_x: bool = False) -> np.ndarray: wrist_xyz = _compute_wrist_position(keypoints) if np.any(wrist_xyz >= INVALID_VALUE - 1): return np.full(6, 
INVALID_VALUE, dtype=np.float32) - wrist_ypr = _compute_wrist_orientation(keypoints, mirror_y=mirror_y) + wrist_ypr = _compute_wrist_orientation(keypoints, flip_x=flip_x) return np.concatenate([wrist_xyz, wrist_ypr]).astype(np.float32) @@ -432,34 +453,48 @@ def _nonempty(p: str | None) -> bool: if n_frames <= ACTION_WINDOW: raise ValueError(f"Task {task_id} has too few frames ({n_frames})") + print(f"[{task_id}] Loading all video frames sequentially...") + t_vid = time.perf_counter() + all_images = extractor.load_all_images() + print(f"[{task_id}] Loaded {len(all_images)} video frames in {time.perf_counter() - t_vid:.1f}s (SFS frames={n_frames})") + if len(all_images) != n_frames: + print(f"[{task_id}] WARNING: video frame count ({len(all_images)}) != SFS frame count ({n_frames}) — index drift possible") + task_desc = _task_description(frames, extractor.demonstration_metadata) valid_frame_count = n_frames - ACTION_WINDOW # ------------------------------------------------------------------ # Precompute all per-frame data into dense arrays (once) # ------------------------------------------------------------------ - left_world = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) - right_world = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) - left_wrist = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) - right_wrist = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) + left_world_6 = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) + right_world_6 = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) + left_wrist_6 = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) + right_wrist_6 = np.full((n_frames, 6), INVALID_VALUE, dtype=np.float32) left_kps = np.full((n_frames, 63), INVALID_VALUE, dtype=np.float32) right_kps = np.full((n_frames, 63), INVALID_VALUE, dtype=np.float32) - head_pose_world = np.zeros((n_frames, 6), dtype=np.float32) + head_pose_6 = np.zeros((n_frames, 6), dtype=np.float32) for i, frame in 
enumerate(frames): if frame.hand_keypoints.left is not None: - left_world[i] = _compute_palm_6dof(frame.hand_keypoints.left) - left_wrist[i] = _compute_wrist_6dof(frame.hand_keypoints.left) + left_world_6[i] = _compute_palm_6dof(frame.hand_keypoints.left) + left_wrist_6[i] = _compute_wrist_6dof(frame.hand_keypoints.left) left_kps[i] = frame.hand_keypoints.left.flatten().astype(np.float32) if frame.hand_keypoints.right is not None: - right_world[i] = _compute_palm_6dof(frame.hand_keypoints.right, mirror_y=True) - right_wrist[i] = _compute_wrist_6dof(frame.hand_keypoints.right, mirror_y=True) + right_world_6[i] = _compute_palm_6dof(frame.hand_keypoints.right, flip_x=True) + right_wrist_6[i] = _compute_wrist_6dof(frame.hand_keypoints.right, flip_x=True) right_kps[i] = frame.hand_keypoints.right.flatten().astype(np.float32) - head_pose_world[i, :3] = frame.camera_pose.position.astype(np.float32) - head_pose_world[i, 3:] = R.from_matrix(frame.camera_pose.rotation_matrix).as_euler( + head_pose_6[i, :3] = frame.camera_pose.position.astype(np.float32) + head_pose_6[i, 3:] = R.from_matrix(frame.camera_pose.rotation_matrix).as_euler( "ZYX", degrees=False ).astype(np.float32) + # Batch-convert all (N, 6) [xyz + euler ZYX] -> (N, 7) [xyz + quat wxyz] + left_world = _batch_pose6_to_pose7(left_world_6) + right_world = _batch_pose6_to_pose7(right_world_6) + left_wrist = _batch_pose6_to_pose7(left_wrist_6) + right_wrist = _batch_pose6_to_pose7(right_wrist_6) + head_pose_world = _batch_pose6_to_pose7(head_pose_6) + # ------------------------------------------------------------------ # Filter valid frame indices (same criteria as old script) # ------------------------------------------------------------------ @@ -497,16 +532,16 @@ def _nonempty(p: str | None) -> bool: if len(sub) < 10: continue - min_frame = min(sub) - max_frame = max(sub) - image_batch = extractor.load_images_for_range(min_frame, max_frame+1) - # First pass: figure out which frames have images kept: list[int] = []
+ none_count = 0 for t in sub: - img = image_batch[t - min_frame] + img = all_images[t] if t < len(all_images) else None if img is not None: kept.append(t) + else: + none_count += 1 + print(f"[ep{written}] sub={len(sub)} kept={len(kept)} dropped(no image)={none_count} frames=[{sub[0]}..{sub[-1]}]") if len(kept) < 10: continue @@ -514,18 +549,18 @@ def _nonempty(p: str | None) -> bool: # ---- Per-frame current state (vectorised) ---- kept_arr = np.array(kept) - left_curr_6 = left_world[kept_arr] # (T, 6) - right_curr_6 = right_world[kept_arr] - left_curr_6 = np.where(left_curr_6 >= INVALID_VALUE - 1, 0.0, left_curr_6).astype( + left_curr_7 = left_world[kept_arr] # (T, 7) + right_curr_7 = right_world[kept_arr] + left_curr_7 = np.where(left_curr_7 >= INVALID_VALUE - 1, 0.0, left_curr_7).astype( np.float32 ) - right_curr_6 = np.where(right_curr_6 >= INVALID_VALUE - 1, 0.0, right_curr_6).astype( + right_curr_7 = np.where(right_curr_7 >= INVALID_VALUE - 1, 0.0, right_curr_7).astype( np.float32 ) - left_wrist_curr_6 = np.where( + left_wrist_curr_7 = np.where( left_wrist[kept_arr] >= INVALID_VALUE - 1, 0.0, left_wrist[kept_arr] ).astype(np.float32) - right_wrist_curr_6 = np.where( + right_wrist_curr_7 = np.where( right_wrist[kept_arr] >= INVALID_VALUE - 1, 0.0, right_wrist[kept_arr] ).astype(np.float32) @@ -540,19 +575,20 @@ def _nonempty(p: str | None) -> bool: # ---- Build image array ---- images = np.stack( - [cv2.resize(image_batch[t - min_frame], IMAGE_SIZE, interpolation=cv2.INTER_LINEAR) + [cv2.resize(all_images[t], IMAGE_SIZE, interpolation=cv2.INTER_LINEAR) for t in kept], axis=0, ).astype(np.uint8) + print(f"[ep{written}] images.shape={images.shape} kept_arr.shape={kept_arr.shape} match={images.shape[0] == len(kept_arr)}") # ---- Numeric data ---- numeric_data = { - "left.obs_ee_pose": left_curr_6, - "right.obs_ee_pose": right_curr_6, + "left.obs_ee_pose": left_curr_7, + "right.obs_ee_pose": right_curr_7, "left.obs_keypoints": left_keypoints, 
"right.obs_keypoints": right_keypoints, - "left.obs_wrist_pose": left_wrist_curr_6, - "right.obs_wrist_pose": right_wrist_curr_6, + "left.obs_wrist_pose": left_wrist_curr_7, + "right.obs_wrist_pose": right_wrist_curr_7, "obs_head_pose": actions_head, } image_data = { diff --git a/training_aws.md b/training_aws.md deleted file mode 100644 index fafe1c15..00000000 --- a/training_aws.md +++ /dev/null @@ -1,117 +0,0 @@ -# EgoVerse Dataset Setup and Training Guide - -This guide provides step-by-step instructions for setting up the dataset and training a model in the **EgoVerse** repository. - ---- - -## 1. Setting Up the Data Directory - -Start by navigating out of the **EgoVerse** repository and creating a `data` directory: - -```bash -cd .. -mkdir data -cd data -``` - ---- - -## 2. Downloading the Processed Data - -Download the processed dataset from AWS S3: - -```bash -aws s3 cp s3://rldb/processed/{processed_data_directory}/ {my_processed_data_directory} --recursive -``` - -Replace `{processed_data_directory}` with the name of the dataset you want to download, and `{my_processed_data_directory}` with your desired local directory name. - ---- - -## 3. Modifying Configuration Files - -Once the dataset is downloaded, navigate back to the **EgoVerse** repository: - -```bash -cd .. 
-cd EgoVerse -``` - -### **Modify `multi-data.yaml`** -Open the configuration file located at: -📌 **`hydra_configs/data/multi_data.yaml`** - -Update the following segments to match your local dataset path: - -```yaml -train_datasets: - dataset1: - _target_: rldb.utils.RLDBDataset - repo_id: "egoverse/smallShirtFold" - root: "{path/to/data/my_processed_data_directory/lerobot}" - local_files_only: true - mode: "train" - -valid_datasets: - dataset1: - _target_: rldb.utils.RLDBDataset - repo_id: "egoverse/smallShirtFold" - root: "{path/to/data/my_processed_data_directory/lerobot}" - local_files_only: true - mode: "valid" -``` -🔹 Replace `{path/to/data/my_processed_data_directory/lerobot}` with the actual path to the lerobot folder in your downloaded dataset. - ---- - -### **Modify `train.yaml`** -Open the configuration file located at: -📌 **`hydra_configs/train.yaml`** - -Modify the **data schematic** section as follows: - -```yaml -data_schematic: # Dynamically fill in these shapes from the dataset - _target_: rldb.utils.DataSchematic - schematic_dict: - aria_bimanual: - front_img_1: - key_type: camera_keys - lerobot_key: observations.images.front_img_1 - ee_pose: - key_type: proprio_keys - lerobot_key: observations.state.ee_pose - actions_cartesian: - key_type: action_keys - lerobot_key: actions_cartesian - embodiment: - key_type: metadata_keys - lerobot_key: metadata.embodiment - viz_img_key: - aria_bimanual: - front_img_1 -``` - ---- - -## 4. Launch Training - -### **Activate Your Environment** -Before running training, activate your **Conda** or **UV** environment: - -```bash -conda activate # If using Conda -``` -or -```bash -uv venv && source /bin/activate # If using UV -``` - -### **Run Training on a GPU Node** -Execute the training script: - -```bash -python trainHydra.py -``` ---- -