diff --git a/egomimic/robot/checkpoint_test.py b/egomimic/robot/checkpoint_test.py
new file mode 100644
index 00000000..bbf4a4f0
--- /dev/null
+++ b/egomimic/robot/checkpoint_test.py
@@ -0,0 +1,21 @@
+import torch, pprint
+
+def summarize(path):
+    ckpt = torch.load(path, map_location="cpu")
+    print("\n===", path, "===")
+    print("top-level keys:", sorted(ckpt.keys()))
+    print("pl version:", ckpt.get("pytorch-lightning_version") or ckpt.get("lightning_version"))
+    hp = ckpt.get("hyper_parameters", {})
+    print("hyper_parameters keys count:", len(hp))
+    print("has robomimic_model in hyper_parameters?:", "robomimic_model" in hp)
+    # show likely constructor-related keys
+    print("hp_keys:", sorted(hp.keys()))
+    interesting = [k for k in hp.keys() if any(s in k.lower() for s in ["model", "policy", "config", "algo", "robomimic"])]
+    print("interesting hparams keys:", sorted(interesting)[:50])
+    # optional: show hparams content types
+    for k in sorted(interesting)[:20]:
+        v = hp[k]
+        print(f" {k}: {type(v)}")
+
+summarize("/home/robot/robot_ws/egomimic/robot/models/hpt_objcont_bc/0/checkpoints/epoch_epoch=1599.ckpt")
+summarize("/home/robot/robot_ws/egomimic/robot/models/pi_eva_objcon_full_folder/0/checkpoints/epoch_epoch=399.ckpt")
diff --git a/egomimic/robot/data_visualization.ipynb b/egomimic/robot/data_visualization.ipynb
index 3841352d..0488a28f 100644
--- a/egomimic/robot/data_visualization.ipynb
+++ b/egomimic/robot/data_visualization.ipynb
@@ -21,7 +21,7 @@
     "import numpy as np\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
-    "h5_path = \"/home/robot/robot_ws/egomimic/robot/demos/demo_1.hdf5\" # <- change this\n",
+    "h5_path = \"/home/robot/robot_ws/egomimic/robot/demo/fold_clothes_ood/demo_5.hdf5\" # <- change this\n",
     "\n",
     "with h5py.File(h5_path, \"r\") as f:\n",
     "    # 1) load action\n",
diff --git a/egomimic/robot/eva/eva.md b/egomimic/robot/eva/eva.md
index b85b4710..5ead3f13 100755
--- a/egomimic/robot/eva/eva.md
+++ b/egomimic/robot/eva/eva.md
@@ -61,6 +61,16 @@ From the repo root (or wherever `run_eva_docker.sh` lives):
 ./run_eva_docker.sh {left | right | both}
 ```
 
+If the script errors with "could not select device driver", run the following commands in a host shell:
+
+```bash
+sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+```
+
 Once the container appears in Cursor / VS Code, attach a terminal to it and run:
 
 ```bash
@@ -202,4 +212,8 @@ After you are done collecting data:
 
 ```bash
 python3 eva_uploader.py
-```
\ No newline at end of file
+```
+
+## ROLLOUTS
+
+Note: Do not use `env python`.
\ No newline at end of file
diff --git a/egomimic/robot/pi_rollout.py b/egomimic/robot/pi_rollout.py
new file mode 100644
index 00000000..32c786a8
--- /dev/null
+++ b/egomimic/robot/pi_rollout.py
@@ -0,0 +1,726 @@
+import collections
+from multiprocessing import process
+import os
+import time
+import h5py
+import torch
+import sys
+import numpy as np
+import cv2
+from abc import ABC, abstractmethod
+
+import egomimic
+from egomimic.algo import *
+from egomimic.algo.hpt import HPTModel
+from egomimic.models.denoising_policy import DenoisingPolicy
+from egomimic.rldb.utils import EMBODIMENT, get_embodiment, get_embodiment_id, RLDBDataset
+from egomimic.pl_utils.pl_model import ModelWrapper
+
+from robot_utils import RateLoop
+from egomimic.utils.egomimicUtils import CameraTransforms, draw_actions, cam_frame_to_base_frame, ee_pose_to_cam_frame, base_frame_to_cam_frame, interpolate_arr, interpolate_arr_euler, ee_orientation_to_cam_frame
+from egomimic.robot.eva.eva_kinematics import EvaMinkKinematicsSolver
+sys.path.append(os.path.join(os.path.dirname(__file__), "eva/eva_ws/src/eva"))
+
+import sys
+import select
+import termios
+import tty
+
+from omegaconf.listconfig import ListConfig
+from omegaconf.base import ContainerMetadata
+from typing import Any, Dict, List, Optional, Tuple
+
+from robot_interface import *
+# from stream_aria import AriaRecorder
+# from stream_d405 import RealSenseRecorder
+def visualize_actions(ims, actions, extrinsics, intrinsics, arm="both"):
+    if actions.shape[-1] == 7 or actions.shape[-1] == 14:
+        ac_type = "joints"
+    elif actions.shape[-1] == 3 or actions.shape[-1] == 6:
+        ac_type = "xyz"
+    else:
+        raise ValueError(f"Unknown action type with shape {actions.shape}")
+
+    ims = draw_actions(
+        ims, ac_type, "Purples", actions, extrinsics, intrinsics, arm=arm
+    )
+
+    return ims
+
+
+from scipy.spatial.transform import Rotation as R
+import pickle
+from pathlib import Path
+
+
+# Control parameters
+DEFAULT_FREQUENCY = 30  # Hz
+QUERY_FREQUENCY = 30
+
+RIGHT_CAM_SERIAL = ""
+LEFT_CAM_SERIAL = ""
+
+EMBODIMENT_MAP = {
+    "both": 8,
+    "left": 7,
+    "right": 6,
+}
+
+class _KeyPoll:
+    def __enter__(self):
+        self.fd = sys.stdin.fileno()
+        self.old = termios.tcgetattr(self.fd)
+        tty.setcbreak(self.fd)  # no Enter needed
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        termios.tcsetattr(self.fd, termios.TCSADRAIN, self.old)
+
+    def getch(self):
+        if select.select([sys.stdin], [], [], 0)[0]:
+            return sys.stdin.read(1)
+        return None
+
+class Rollout(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def rollout_step(self, i):
+        pass
+
+Array2D = Any  # np.ndarray or torch.Tensor of shape (T, D)
+
+def _to_picklable(x):
+    # best-effort conversion to make obs/batch serializable
+    if isinstance(x, torch.Tensor):
+        return x.detach().cpu()
+    if isinstance(x, np.ndarray):
+        return x
+    if isinstance(x, dict):
+        return {k: _to_picklable(v) for k, v in x.items()}
+    if isinstance(x, (list, tuple)):
+        t = [_to_picklable(v) for v in x]
+        return type(x)(t)
+    return x
+
+def _dump_pickle(path: Path, obj) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = str(path) + ".tmp"
+    with open(tmp, "wb") as f:
+        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+    os.replace(tmp, str(path))
+
+
+def _as_matrix(x):
+    """Return a 3x3 ndarray from either Rotation or 3x3 ndarray."""
+    return x.as_matrix() if isinstance(x, R) else np.asarray(x, dtype=float)
+
+
+def _row_to_numpy(x: Array2D, i: int) -> np.ndarray:
+    """Get i-th row as numpy array (handles torch or numpy input)."""
+    if isinstance(x, np.ndarray):
+        return x[i]
+    return x[i].detach().cpu().numpy()
+
+def fk_SE3(
+    joints_2d: Array2D, eva_fk: EvaMinkKinematicsSolver, *, dtype=torch.float32, device=None
+) -> torch.Tensor:
+    """
+    Eva FK full SE(3) for a sequence of joint vectors.
+    Returns torch.Tensor of shape (T, 4, 4) with bottom-right set to 1.
+    Downstream can keep using fk[:, :3, 3] and fk[:, :3, :3].
+ """ + T = joints_2d.shape[0] + out = torch.zeros((T, 4, 4), dtype=dtype, device=device) + out[:, 3, 3] = 1.0 + for i in range(T): + q = _row_to_numpy(joints_2d, i) + pos, R_obj = eva_fk.fk(q) # numpy: (3,), (3,3) + Rm = _as_matrix(R_obj) + out[i, :3, :3] = torch.as_tensor(Rm, dtype=dtype, device=device) + out[i, :3, 3] = torch.as_tensor(pos, dtype=dtype, device=device) + return out + + +EVA_XML_PATH = os.path.join( + os.path.dirname(egomimic.__file__), "resources/model_x5.xml" +) +EVA_FK = EvaMinkKinematicsSolver(model_path=str(EVA_XML_PATH)) + +class ReplayRollout(Rollout): + def __init__(self, dataset_path, cartesian): + super().__init__() + self.dataset_path = dataset_path + if not os.path.isfile(self.dataset_path): + raise FileNotFoundError(f"HDF5 not found: {self.dataset_path}") + with h5py.File(self.dataset_path, "r") as f: + if cartesian: + self.actions = np.asarray(f["actions"]["eepose"][...], dtype=np.float32) + else: + self.actions = np.asarray( + f["observations"]["joint_positions"][...], dtype=np.float32 + ) + + def rollout_step(self, i): + if i < self.actions.shape[0]: + return self.actions[i] + else: + return None + +# TODO: Work with all types of arms +class ReplayRolloutLerobot(Rollout): + def __init__(self, dataset_path, repo_id, cartesian, extrinsics_key, episodes=[1], arm="right"): + super().__init__() + self.dataset_path = dataset_path + self.cartesian = cartesian + self.extrinsics = CameraTransforms(intrinsics_key="base", extrinsics_key=extrinsics_key).extrinsics + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.debug_actions = None + self.arm = arm + + dataset = RLDBDataset(repo_id=repo_id, root=dataset_path, local_files_only=True, episodes=episodes, mode="sample") + data_loader = torch.utils.data.DataLoader(dataset, + batch_size=32, + shuffle=False) + self.iter = iter(data_loader) + self.data_loader = data_loader + self.i = 0 + self.actions_key = "actions_cartesian" if cartesian else "actions_joints" + self.actions = None + + def rollout_step(self, i): + while i >= self.i: + try: + batch = next(self.iter) + except StopIteration: + self.iter = iter(self.data_loader) + batch = next(self.iter) + + cur_actions = batch[self.actions_key].cpu().numpy()[:, 0, :] # (B, 7) or (B, 14) + + if self.cartesian: + if self.arm == "both": + left_actions = cur_actions[:, :7] + right_actions = cur_actions[:, 7:14] + + left_grip = (left_actions[:, 6:7]).copy() + right_grip = (right_actions[:, 6:7]).copy() + + transformed_left = cam_frame_to_base_frame( + left_actions[:, :6].copy(), self.extrinsics["left"] + ) + transformed_right = cam_frame_to_base_frame( + right_actions[:, :6].copy(), self.extrinsics["right"] + ) + + left_out = np.hstack([transformed_left, left_grip]) + right_out = np.hstack([transformed_right, right_grip]) + cur_actions = np.hstack([left_out, right_out]) + else: + grip = (cur_actions[:, 6:7]).copy() + transformed_6dof = cam_frame_to_base_frame( + cur_actions[:, :6].copy(), self.extrinsics[self.arm] + ) + cur_actions = np.hstack([transformed_6dof, grip]) + + cur_actions = cur_actions.astype(np.float32, copy=False) + + if self.actions is None or self.actions.shape[0] == 0: + self.actions = cur_actions + else: + self.actions = np.concatenate([self.actions, cur_actions], axis=0) + + self.i += cur_actions.shape[0] + + if self.actions is None: + return None + if i < 0 or i >= self.actions.shape[0]: + return None + return self.actions[i] + + def reset(self): + self.iter = iter(self.data_loader) + self.i = 0 + self.actions = None + self.debug_actions = 
None + +class PolicyRollout(Rollout): + def __init__(self, arm, policy_path, query_frequency, cartesian, extrinsics_key, resampled_action_len=None): + super().__init__() + torch.serialization.add_safe_globals([ListConfig]) + self.arm = arm + self.policy_path = policy_path + self.policy = ModelWrapper.load_from_checkpoint(policy_path, weights_only=False, map_location="cpu") + self.policy = self.policy.to('cuda') + + self.query_frequency = query_frequency + self.cartesian = cartesian + self.embodiment_id = EMBODIMENT_MAP[self.arm] + self.embodiment_name = get_embodiment(self.embodiment_id) + if getattr(self.policy.model, "diffusion", False): + for head in self.policy.model.nets.policy.heads: + if isinstance(self.policy.model.nets.policy.heads[head], DenoisingPolicy): + self.policy.model.nets.policy.heads[head].num_inference_steps = 10 + self.extrinsics = CameraTransforms(intrinsics_key="base", extrinsics_key=extrinsics_key).extrinsics + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.policy.model.nets.to(self.device) + self.debug_actions = None + self.resampled_action_len = resampled_action_len + self._dump_dir = Path("debug/policy_io") + self._dump_i = 0 + + def _downsample_chunk(self, chunk: np.ndarray, target_len: int) -> np.ndarray: + if target_len is None or target_len <= 0 or chunk.shape[0] == target_len: + return chunk.astype(np.float32, copy=False) + + # chunk: (T, D) -> (1, T, D) and back + if self.cartesian: + if self.arm == "both": + left = chunk[:, :7] + right = chunk[:, 7:14] + left_r = interpolate_arr_euler(left[None, ...], target_len)[0] + right_r = interpolate_arr_euler(right[None, ...], target_len)[0] + out = np.hstack([left_r, right_r]) + else: + out = interpolate_arr_euler(chunk[None, ...], target_len)[0] + else: + out = interpolate_arr(chunk[None, ...], target_len)[0] + + return out.astype(np.float32, copy=False) + + def rollout_step(self, i, obs): + if i % self.query_frequency == 0: + start_infer_t = time.time() + batch = self.process_obs_for_policy(obs) + preds = self.policy.model.forward_eval(batch) + ac_key = self.policy.model.ac_keys[self.embodiment_id] + actions = preds[f"{self.embodiment_name.lower()}_{ac_key}"] + self.actions = actions.detach().cpu().numpy().squeeze() + self.debug_actions = self.actions.copy() + if self.cartesian: + if self.arm == "both": + left_actions = self.actions[:, :7] + right_actions = self.actions[:, 7:] + transformed_left = cam_frame_to_base_frame(left_actions[:, :6].copy(), self.extrinsics["left"]) + transformed_right = cam_frame_to_base_frame(right_actions[:, :6].copy(), self.extrinsics["right"]) + if left_actions.shape[1] == 7: + left_actions = np.hstack([transformed_left, left_actions[:, 6:7]]) + else: + left_actions = transformed_left + if right_actions.shape[1] == 7: + right_actions = np.hstack([transformed_right, right_actions[:, 6:7]]) + else: + right_actions = transformed_right + self.actions = np.hstack([left_actions, right_actions]) + else: + transformed_6dof = cam_frame_to_base_frame(self.actions[:, :6].copy(), self.extrinsics[self.arm]) + # Preserve gripper if present (7th value) + if self.actions.shape[1] == 7: + self.actions = np.hstack([transformed_6dof, self.actions[:, 6:7]]) + else: + self.actions = transformed_6dof + + if self.resampled_action_len is not None: + self.actions = self._downsample_chunk(self.actions, self.resampled_action_len) + self.debug_actions = self.actions.copy() + + print(f"Inference time: {(time.time() - start_infer_t)}s") + + # TODO check gripper if we are using 0 to 0.08 or 0 
to 1 + act_i = i % self.query_frequency + return self.actions[act_i] + + + + def process_obs_for_policy(self, obs): + dump_i = self._dump_i + self._dump_i += 1 + + # _dump_pickle(self._dump_dir / f"{dump_i:06d}_obs.pkl", _to_picklable(obs)) + + # front camera: obs["front_img_1"] is BGR, shape [H, W, 3] + front = torch.from_numpy(obs["front_img_1"][None, ...]) # [1, H, W, 3] + front = front[..., [2, 1, 0]] # BGR -> RGB + front = front.permute(0, 3, 1, 2).to(self.device, dtype=torch.float32) / 255.0 + + data = { + "observations.images.front_img_1": front, + "pad_mask": torch.ones((1, 100, 1), device=self.device, dtype=torch.bool), + } + + if self.arm == "right": + right = torch.from_numpy(obs["right_wrist_img"][None, ...]) # [1, H, W, 3] BGR + right = right[..., [2, 1, 0]] # BGR -> RGB + right = ( + right.permute(0, 3, 1, 2) + .to(self.device, dtype=torch.float32) + / 255.0 + ) + data["observations.images.right_wrist_img"] = right + joint_positions = obs["joint_positions"][7:] + fk = fk_SE3( + torch.from_numpy(joint_positions[None, :]).to(torch.float32), + EVA_FK, + dtype=torch.float32, + device=self.device, + ) + fk_positions = fk[:, :3, 3] + fk_orientations = fk[:, :3, :3] + + extrinsics = self.extrinsics[self.arm] + fk_positions = ee_pose_to_cam_frame(fk_positions.cpu().numpy(), extrinsics) + fk_orientations, fk_ypr = ee_orientation_to_cam_frame( + fk_orientations, extrinsics + ) + ee_pose = np.concatenate([fk_positions[0], fk_ypr[0], np.array([joint_positions[-1]])], axis=0) + + elif self.arm == "left": + left = torch.from_numpy(obs["left_wrist_img"][None, ...]) # [1, H, W, 3] BGR + left = left[..., [2, 1, 0]] # BGR -> RGB + left = ( + left.permute(0, 3, 1, 2) + .to(self.device, dtype=torch.float32) + / 255.0 + ) + data["observations.images.left_wrist_img"] = left + joint_positions = obs["joint_positions"][:7] + + fk = fk_SE3( + torch.from_numpy(joint_positions[None, :]).to(torch.float32), + EVA_FK, + dtype=torch.float32, + device=self.device, + ) + fk_positions = fk[:, :3, 3] + fk_orientations = fk[:, :3, :3] + + extrinsics = self.extrinsics[self.arm] + fk_positions = ee_pose_to_cam_frame(fk_positions.cpu().numpy(), extrinsics) + fk_orientations, fk_ypr = ee_orientation_to_cam_frame( + fk_orientations, extrinsics + ) + + ee_pose = np.concatenate([fk_positions[0], fk_ypr[0], np.array([joint_positions[-1]])], axis=0) + + elif self.arm == "both": + right = torch.from_numpy(obs["right_wrist_img"][None, ...]) + right = right[..., [2, 1, 0]] + right = ( + right.permute(0, 3, 1, 2) + .to(self.device, dtype=torch.float32) + / 255.0 + ) + left = torch.from_numpy(obs["left_wrist_img"][None, ...]) + left = left[..., [2, 1, 0]] + left = ( + left.permute(0, 3, 1, 2) + .to(self.device, dtype=torch.float32) + / 255.0 + ) + data["observations.images.right_wrist_img"] = right + data["observations.images.left_wrist_img"] = left + joint_positions = obs["joint_positions"] + + left_joint = joint_positions[:7] + right_joint = joint_positions[7:] + + # FK for left + fk_left = fk_SE3( + torch.from_numpy(left_joint[None, :]).to(torch.float32), + EVA_FK, + dtype=torch.float32, + device=self.device, + ) + fk_pos_left = fk_left[:, :3, 3] + fk_rot_left = fk_left[:, :3, :3] + extr_left = self.extrinsics["left"] + fk_pos_left = ee_pose_to_cam_frame(fk_pos_left.cpu().numpy(), extr_left) + fk_rot_left, fk_ypr_left = ee_orientation_to_cam_frame(fk_rot_left, extr_left) + ee_pose_left = np.concatenate([fk_pos_left[0], fk_ypr_left[0], np.array([left_joint[-1]])], axis=0) + + # FK for right + fk_right = fk_SE3( + 
+                torch.from_numpy(right_joint[None, :]).to(torch.float32),
+                EVA_FK,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            fk_pos_right = fk_right[:, :3, 3]
+            fk_rot_right = fk_right[:, :3, :3]
+            extr_right = self.extrinsics["right"]
+            fk_pos_right = ee_pose_to_cam_frame(fk_pos_right.cpu().numpy(), extr_right)
+            fk_rot_right, fk_ypr_right = ee_orientation_to_cam_frame(fk_rot_right, extr_right)
+            ee_pose_right = np.concatenate([fk_pos_right[0], fk_ypr_right[0], np.array([right_joint[-1]])], axis=0)
+
+            # concatenate both arm ee_poses
+            ee_pose = np.concatenate([ee_pose_left, ee_pose_right], axis=0)
+
+        data["observations.state.joint_positions"] = torch.from_numpy(joint_positions).reshape(1, -1)
+        data["embodiment"] = torch.tensor([self.embodiment_id], dtype=torch.int64)
+        data["observations.state.ee_pose"] = torch.from_numpy(ee_pose).reshape(1, -1)
+
+        if not self.cartesian:
+            data["actions_joints"] = torch.zeros_like(data["observations.state.joint_positions"]).reshape(1, 1, -1)
+        else:
+            data["actions_cartesian"] = torch.zeros_like(data["observations.state.joint_positions"]).reshape(1, 1, -1)
+
+        # processed_batch = {self.embodiment_id: data}
+        processed_batch = {self.embodiment_id: {}}
+        # move non-image tensors to device and float32 (images already are)
+        for key, val in data.items():
+            if key not in ("front_img_1", "right_wrist_img", "left_wrist_img"):
+                data[key] = val.to(self.device, dtype=torch.float32)
+
+        for key, val in data.items():
+            batch_key = self.policy.model.data_schematic.lerobot_key_to_keyname(key, self.embodiment_id)
+            processed_batch[self.embodiment_id][batch_key] = val
+            # print(f"Key: {key}, batch_key: {batch_key}, shape: {val.shape}")
+
+        processed_batch[self.embodiment_id]["pad_mask"] = data["pad_mask"]
+        processed_batch[self.embodiment_id]["embodiment"] = data["embodiment"]
+        processed_batch[self.embodiment_id] = (
+            self.policy.model.data_schematic.normalize_data(
+                processed_batch[self.embodiment_id], self.embodiment_id
+            )
+        )
+
+        # _dump_pickle(
+        #     self._dump_dir / f"{dump_i:06d}_batch.pkl",
+        #     _to_picklable(processed_batch),
+        # )
+        return processed_batch
+
+
+    def reset(self):
+        self.actions = None
+        self.debug_actions = None
+        self.policy = ModelWrapper.load_from_checkpoint(self.policy_path, weights_only=False, map_location="cpu")
+        self.policy = self.policy.to('cuda')
+        if getattr(self.policy.model, "diffusion", False):
+            for head in self.policy.model.nets.policy.heads:
+                if isinstance(self.policy.model.nets.policy.heads[head], DenoisingPolicy):
+                    self.policy.model.nets.policy.heads[head].num_inference_steps = 10
+
+
+def reset_rollout(ri, policy):
+    print("Resetting rollout: going home + clearing policy state")
+    ri.set_home()
+    if hasattr(policy, "reset"):
+        policy.reset()
+    if hasattr(policy, "actions"):
+        policy.actions = None
+    if hasattr(policy, "debug_actions"):
+        policy.debug_actions = None
+
+def main(
+    arms,
+    frequency,
+    cartesian,
+    query_frequency=None,
+    policy_path=None,
+    dataset_path=None,
+    repo_id=None,
+    episodes=[0],
+    debug=False,
+    resampled_action_len=None,
+):
+    if arms == "both":
+        arms_list = ["right", "left"]
+    elif arms == "right":
+        arms_list = ["right"]
+    else:
+        arms_list = ["left"]
+
+    ri = ARXInterface(arms=arms_list)
+
+    if policy_path is None and dataset_path is not None and repo_id is not None:
+        rollout_type = "replay_lerobot"
+        policy = ReplayRolloutLerobot(
+            dataset_path=dataset_path,
+            repo_id=repo_id,
+            cartesian=cartesian,
+            extrinsics_key="x5Dec13_2",
+            episodes=episodes,
+            arm=arms,
+        )
+    elif policy_path is not None:
+        rollout_type = "policy"
+        policy = PolicyRollout(
+            arm=arms,
+            policy_path=policy_path,
+            query_frequency=query_frequency,
+            cartesian=cartesian,
+            extrinsics_key="x5Dec13_2",
+            resampled_action_len=resampled_action_len
+        )
+    elif dataset_path is not None:
+        rollout_type = "replay"
+        policy = ReplayRollout(dataset_path=dataset_path, cartesian=cartesian)
+    else:
+        raise ValueError("Must provide either --policy-path or --dataset-path (and optionally --repo-id).")
+
+    print(f"Cartesian value {cartesian}")
+
+    camera_transforms = CameraTransforms(intrinsics_key="base", extrinsics_key="x5Dec13_2")
+    kinematics_solver = EvaMinkKinematicsSolver(
+        model_path="/home/robot/robot_ws/egomimic/resources/model_x5.xml"
+    )
+    try:
+        with _KeyPoll() as kp:
+            reset_rollout(ri, policy)
+
+            while True:  # restartable
+                with RateLoop(frequency=frequency, verbose=True) as loop:
+                    for step_i in loop:
+                        ch = kp.getch()
+                        if ch == "q":
+                            print("Quit requested.")
+                            return
+                        if ch == "r":
+                            print("Restart requested.")
+                            reset_rollout(ri, policy)
+                            time.sleep(10.0)
+                            break  # restart RateLoop
+
+                        actions = None
+                        if rollout_type == "policy":
+                            obs = ri.get_obs()
+                            actions = policy.rollout_step(step_i, obs)
+                        elif rollout_type == "replay":
+                            actions = policy.rollout_step(step_i)
+                        elif rollout_type == "replay_lerobot":
+                            actions = policy.rollout_step(step_i)
+                        else:
+                            raise ValueError(f"Invalid rollout type: {rollout_type}")
+
+                        if actions is None:
+                            print("Finished rollout. Press 'r' to restart or 'q' to quit.")
+                            while True:
+                                ch2 = kp.getch()
+                                if ch2 == "q":
+                                    return
+                                if ch2 == "r":
+                                    reset_rollout(ri, policy)
+                                    time.sleep(10.0)
+                                    break
+                                time.sleep(0.01)
+                            break
+
+                        if debug and rollout_type == "policy":
+                            os.makedirs("debug", exist_ok=True)
+
+                            if isinstance(obs["front_img_1"], torch.Tensor):
+                                if obs["front_img_1"].dim() == 4:
+                                    img = obs["front_img_1"][0].permute(1, 2, 0).cpu().numpy()
+                                elif obs["front_img_1"].dim() == 3:
+                                    img = obs["front_img_1"].permute(1, 2, 0).cpu().numpy()
+                                else:
+                                    img = obs["front_img_1"].cpu().numpy()
+                            else:
+                                img = obs["front_img_1"]
+                            if img.ndim == 3 and img.shape[0] == 3:
+                                img = img.transpose(1, 2, 0)
+                            img = img.astype(np.uint8)
+
+                            for arm in arms_list:
+                                arm_offset = 7 if (arm == "right" and arms == "both") else 0
+
+                                if cartesian:
+                                    action_xyz = policy.debug_actions[:, :3]
+                                else:
+                                    jnts = policy.actions[:, :7]
+                                    actions_xyz = np.zeros((jnts.shape[0], 3), dtype=np.float32)
+                                    for j in range(actions_xyz.shape[0]):
+                                        pos, _rot = kinematics_solver.fk(jnts[j][:6])
+                                        actions_xyz[j] = pos
+                                    action_xyz = actions_xyz
+
+                                im_viz = visualize_actions(
+                                    img,
+                                    action_xyz,
+                                    camera_transforms.extrinsics,
+                                    camera_transforms.intrinsics,
+                                    arm=arm,
+                                )
+                                cv2.imwrite(f"debug/debug_{arm}_{step_i}.png", im_viz)
+
+                        for arm in arms_list:
+                            arm_offset = 7 if (arm == "right" and arms == "both") else 0
+                            arm_action = actions[arm_offset : arm_offset + 7]
+                            if cartesian:
+                                ri.set_pose(arm_action, arm)
+                            else:
+                                ri.set_joints(arm_action, arm)
+
+    except KeyboardInterrupt:
+        print("KeyboardInterrupt detected, exiting rollout.")
+        return
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Rollout robot model.")
+    parser.add_argument(
+        "--arms",
+        type=str,
+        default="right",
+        choices=["left", "right", "both"],
+        help="Which arm(s) to control",
+    )
+    parser.add_argument(
+        "--frequency",
+        type=float,
+        default=DEFAULT_FREQUENCY,
+        help="Control loop frequency in Hz",
+    )
+    parser.add_argument(
+        "--query_frequency",
+        type=int,
+        default=QUERY_FREQUENCY,
+        help="Number of control steps between policy inference calls",
+    )
+    parser.add_argument("--policy-path", type=str, help="policy checkpoint path")
+    parser.add_argument("--dataset-path", type=str, help="dataset path for replay")
+    parser.add_argument("--repo-id", type=str, help="repo id for replay")
+    parser.add_argument("--episodes", type=int, nargs="+", help="episodes to replay")
+    parser.add_argument(
+        "--cartesian",
+        action="store_true",
+        help="control in cartesian space instead of joint space",
+    )
+
+    parser.add_argument(
+        "--resampled-action-len",
+        type=int,
+        default=45,
+        help="Resample each predicted action chunk to this length (e.g., 100 -> 45). Euler interpolation is used when --cartesian is set.",
+    )
+
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="enable debug visualization of actions on images",
+    )
+
+    args = parser.parse_args()
+    episodes = args.episodes if args.episodes is not None else [0]
+
+    print(f"Resampling actions to {args.resampled_action_len}")
+    # breakpoint()
+    # ModelWrapper.load_from_checkpoint(args.policy_path)
+    # exit()
+
+    main(
+        arms=args.arms,
+        frequency=args.frequency,
+        query_frequency=args.query_frequency,
+        policy_path=args.policy_path,
+        dataset_path=args.dataset_path,
+        repo_id=args.repo_id,
+        episodes=episodes,
+        cartesian=args.cartesian,
+        debug=args.debug,
+        resampled_action_len=args.resampled_action_len
+    )
diff --git a/egomimic/robot/rollout.py b/egomimic/robot/rollout.py
index 9249ffc9..3c4ab726 100644
--- a/egomimic/robot/rollout.py
+++ b/egomimic/robot/rollout.py
@@ -1,3 +1,5 @@
+import collections
+from multiprocessing import process
 import os
 import time
 import h5py
@@ -7,6 +9,7 @@
 import cv2
 from abc import ABC, abstractmethod
 
+import egomimic
 from egomimic.algo import *
 from egomimic.algo.hpt import HPTModel
 from egomimic.models.denoising_policy import DenoisingPolicy
@@ -14,7 +17,7 @@
 from egomimic.pl_utils.pl_model import ModelWrapper
 
 from robot_utils import RateLoop
-from egomimic.utils.egomimicUtils import CameraTransforms, draw_actions, cam_frame_to_base_frame, ee_pose_to_cam_frame, base_frame_to_cam_frame, interpolate_arr, interpolate_arr_euler
+from egomimic.utils.egomimicUtils import CameraTransforms, draw_actions, cam_frame_to_base_frame, ee_pose_to_cam_frame, base_frame_to_cam_frame, interpolate_arr, interpolate_arr_euler, ee_orientation_to_cam_frame
 from egomimic.robot.eva.eva_kinematics import EvaMinkKinematicsSolver
 sys.path.append(os.path.join(os.path.dirname(__file__), "eva/eva_ws/src/eva"))
 
@@ -23,6 +26,10 @@
 import termios
 import tty
 
+from omegaconf.listconfig import ListConfig
+from omegaconf.base import ContainerMetadata
+from typing import Any, Dict, List, Optional, Tuple
+
 from robot_interface import *
 # from stream_aria import AriaRecorder
 # from stream_d405 import RealSenseRecorder
@@ -40,6 +47,12 @@ def visualize_actions(ims, actions, extrinsics, intrinsics, arm="both"):
 
     return ims
 
+
+from scipy.spatial.transform import Rotation as R
+import pickle
+from pathlib import Path
+
+
 # Control parameters
 DEFAULT_FREQUENCY = 30  # Hz
 QUERY_FREQUENCY = 30
@@ -75,7 +88,65 @@ def __init__(self):
 
     @abstractmethod
     def rollout_step(self, i):
        pass
-
+
+Array2D = Any  # np.ndarray or torch.Tensor of shape (T, D)
+
+def _to_picklable(x):
+    # best-effort conversion to make obs/batch serializable
+    if isinstance(x, torch.Tensor):
+        return x.detach().cpu()
+    if isinstance(x, np.ndarray):
+        return x
+    if isinstance(x, dict):
+        return {k: _to_picklable(v) for k, v in x.items()}
+    if isinstance(x, (list, tuple)):
+        t = [_to_picklable(v) for v in x]
+        return type(x)(t)
+    return x
+
+def _dump_pickle(path: Path, obj) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = str(path) + ".tmp"
+    with open(tmp, "wb") as f:
+        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
+    os.replace(tmp, str(path))
+
+
+def _as_matrix(x):
+    """Return a 3x3 ndarray from either Rotation or 3x3 ndarray."""
+    return x.as_matrix() if isinstance(x, R) else np.asarray(x, dtype=float)
+
+
+def _row_to_numpy(x: Array2D, i: int) -> np.ndarray:
+    """Get i-th row as numpy array (handles torch or numpy input)."""
+    if isinstance(x, np.ndarray):
+        return x[i]
+    return x[i].detach().cpu().numpy()
+
+def fk_SE3(
+    joints_2d: Array2D, eva_fk: EvaMinkKinematicsSolver, *, dtype=torch.float32, device=None
+) -> torch.Tensor:
+    """
+    Eva FK full SE(3) for a sequence of joint vectors.
+    Returns torch.Tensor of shape (T, 4, 4) with bottom-right set to 1.
+    Downstream can keep using fk[:, :3, 3] and fk[:, :3, :3].
+    """
+    T = joints_2d.shape[0]
+    out = torch.zeros((T, 4, 4), dtype=dtype, device=device)
+    out[:, 3, 3] = 1.0
+    for i in range(T):
+        q = _row_to_numpy(joints_2d, i)
+        pos, R_obj = eva_fk.fk(q)  # numpy: (3,), (3,3)
+        Rm = _as_matrix(R_obj)
+        out[i, :3, :3] = torch.as_tensor(Rm, dtype=dtype, device=device)
+        out[i, :3, 3] = torch.as_tensor(pos, dtype=dtype, device=device)
+    return out
+
+
+EVA_XML_PATH = os.path.join(
+    os.path.dirname(egomimic.__file__), "resources/model_x5.xml"
+)
+EVA_FK = EvaMinkKinematicsSolver(model_path=str(EVA_XML_PATH))
 class ReplayRollout(Rollout):
     def __init__(self, dataset_path, cartesian):
@@ -177,9 +248,12 @@ def reset(self):
 class PolicyRollout(Rollout):
     def __init__(self, arm, policy_path, query_frequency, cartesian, extrinsics_key, resampled_action_len=None):
         super().__init__()
+        torch.serialization.add_safe_globals([ListConfig])
         self.arm = arm
         self.policy_path = policy_path
-        self.policy = ModelWrapper.load_from_checkpoint(policy_path, weights_only=False)
+        self.policy = ModelWrapper.load_from_checkpoint(policy_path, weights_only=False, map_location="cpu")
+        self.policy = self.policy.to('cuda')
+
         self.query_frequency = query_frequency
         self.cartesian = cartesian
         self.embodiment_id = EMBODIMENT_MAP[self.arm]
@@ -190,8 +264,11 @@ def __init__(self, arm, policy_path, query_frequency, cartesian, extrinsics_key,
                     self.policy.model.nets.policy.heads[head].num_inference_steps = 10
         self.extrinsics = CameraTransforms(intrinsics_key="base", extrinsics_key=extrinsics_key).extrinsics
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.policy.model.nets.to(self.device)
         self.debug_actions = None
         self.resampled_action_len = resampled_action_len
+        self._dump_dir = Path("debug/policy_io")
+        self._dump_i = 0
 
     def _downsample_chunk(self, chunk: np.ndarray, target_len: int) -> np.ndarray:
         if target_len is None or target_len <= 0 or chunk.shape[0] == target_len:
@@ -254,14 +331,21 @@ def rollout_step(self, i, obs):
         act_i = i % self.query_frequency
         return self.actions[act_i]
 
+
+
     def process_obs_for_policy(self, obs):
+        dump_i = self._dump_i
+        self._dump_i += 1
+
+        _dump_pickle(self._dump_dir / f"{dump_i:06d}_obs.pkl", _to_picklable(obs))
+
         # front camera: obs["front_img_1"] is BGR, shape [H, W, 3]
         front = torch.from_numpy(obs["front_img_1"][None, ...])  # [1, H, W, 3]
         front = front[..., [2, 1, 0]]  # BGR -> RGB
         front = front.permute(0, 3, 1, 2).to(self.device, dtype=torch.float32) / 255.0
 
         data = {
-            "front_img_1": front,
+            "observations.images.front_img_1": front,
             "pad_mask": torch.ones((1, 100, 1), device=self.device, dtype=torch.bool),
         }
 
@@ -273,8 +357,23 @@ def process_obs_for_policy(self, obs):
                 .to(self.device, dtype=torch.float32)
                 / 255.0
             )
-            data["right_wrist_img"] = right
+            data["observations.images.right_wrist_img"] = right
             joint_positions = obs["joint_positions"][7:]
+            fk = fk_SE3(
+                torch.from_numpy(joint_positions[None, :]).to(torch.float32),
+                EVA_FK,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            fk_positions = fk[:, :3, 3]
+            fk_orientations = fk[:, :3, :3]
+
+            extrinsics = self.extrinsics[self.arm]
+            fk_positions = ee_pose_to_cam_frame(fk_positions.cpu().numpy(), extrinsics)
+            fk_orientations, fk_ypr = ee_orientation_to_cam_frame(
+                fk_orientations, extrinsics
+            )
+            ee_pose = np.concatenate([fk_positions[0], fk_ypr[0], np.array([joint_positions[-1]])], axis=0)
 
         elif self.arm == "left":
@@ -284,8 +383,25 @@ def process_obs_for_policy(self, obs):
                 .to(self.device, dtype=torch.float32)
                 / 255.0
             )
-            data["left_wrist_img"] = left
+            data["observations.images.left_wrist_img"] = left
             joint_positions = obs["joint_positions"][:7]
+
+            fk = fk_SE3(
+                torch.from_numpy(joint_positions[None, :]).to(torch.float32),
+                EVA_FK,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            fk_positions = fk[:, :3, 3]
+            fk_orientations = fk[:, :3, :3]
+
+            extrinsics = self.extrinsics[self.arm]
+            fk_positions = ee_pose_to_cam_frame(fk_positions.cpu().numpy(), extrinsics)
+            fk_orientations, fk_ypr = ee_orientation_to_cam_frame(
+                fk_orientations, extrinsics
+            )
+
+            ee_pose = np.concatenate([fk_positions[0], fk_ypr[0], np.array([joint_positions[-1]])], axis=0)
 
         elif self.arm == "both":
@@ -302,37 +418,85 @@ def process_obs_for_policy(self, obs):
                 .to(self.device, dtype=torch.float32)
                 / 255.0
             )
-            data["right_wrist_img"] = right
-            data["left_wrist_img"] = left
+            data["observations.images.right_wrist_img"] = right
+            data["observations.images.left_wrist_img"] = left
             joint_positions = obs["joint_positions"]
+
+            left_joint = joint_positions[:7]
+            right_joint = joint_positions[7:]
+
+            # FK for left
+            fk_left = fk_SE3(
+                torch.from_numpy(left_joint[None, :]).to(torch.float32),
+                EVA_FK,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            fk_pos_left = fk_left[:, :3, 3]
+            fk_rot_left = fk_left[:, :3, :3]
+            extr_left = self.extrinsics["left"]
+            fk_pos_left = ee_pose_to_cam_frame(fk_pos_left.cpu().numpy(), extr_left)
+            fk_rot_left, fk_ypr_left = ee_orientation_to_cam_frame(fk_rot_left, extr_left)
+            ee_pose_left = np.concatenate([fk_pos_left[0], fk_ypr_left[0], np.array([left_joint[-1]])], axis=0)
+
+            # FK for right
+            fk_right = fk_SE3(
+                torch.from_numpy(right_joint[None, :]).to(torch.float32),
+                EVA_FK,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            fk_pos_right = fk_right[:, :3, 3]
+            fk_rot_right = fk_right[:, :3, :3]
+            extr_right = self.extrinsics["right"]
+            fk_pos_right = ee_pose_to_cam_frame(fk_pos_right.cpu().numpy(), extr_right)
+            fk_rot_right, fk_ypr_right = ee_orientation_to_cam_frame(fk_rot_right, extr_right)
+            ee_pose_right = np.concatenate([fk_pos_right[0], fk_ypr_right[0], np.array([right_joint[-1]])], axis=0)
+
+            # concatenate both arm ee_poses
+            ee_pose = np.concatenate([ee_pose_left, ee_pose_right], axis=0)
 
-        data["joint_positions"] = torch.from_numpy(joint_positions).reshape(1, 1, -1)
+        data["observations.state.joint_positions"] = torch.from_numpy(joint_positions).reshape(1, -1)
         data["embodiment"] = torch.tensor([self.embodiment_id], dtype=torch.int64)
+        data["observations.state.ee_pose"] = torch.from_numpy(ee_pose).reshape(1, -1)
 
         if not self.cartesian:
-            data["actions_joints"] = torch.zeros_like(data["joint_positions"])
+            data["actions_joints"] = torch.zeros_like(data["observations.state.joint_positions"]).reshape(1, 1, -1)
         else:
-            data["actions_cartesian"] = torch.zeros_like(data["joint_positions"])
-
-        processed_batch = {self.embodiment_id: data}
+            data["actions_cartesian"] = torch.zeros_like(data["observations.state.joint_positions"]).reshape(1, 1, -1)
+        # processed_batch = {self.embodiment_id: data}
+        processed_batch = {self.embodiment_id: {}}
         # move non-image tensors to device and float32 (images already are)
         for key, val in data.items():
             if key not in ("front_img_1", "right_wrist_img", "left_wrist_img"):
                 data[key] = val.to(self.device, dtype=torch.float32)
+
+        for key, val in data.items():
+            batch_key = self.policy.model.data_schematic.lerobot_key_to_keyname(key, self.embodiment_id)
+            processed_batch[self.embodiment_id][batch_key] = val
+            # print(f"Key: {key}, batch_key: {batch_key}, shape: {val.shape}")
 
+        processed_batch[self.embodiment_id]["pad_mask"] = data["pad_mask"]
+        processed_batch[self.embodiment_id]["embodiment"] = data["embodiment"]
         processed_batch[self.embodiment_id] = (
             self.policy.model.data_schematic.normalize_data(
                 processed_batch[self.embodiment_id], self.embodiment_id
            )
        )
-
+
+        _dump_pickle(
+            self._dump_dir / f"{dump_i:06d}_batch.pkl",
+            _to_picklable(processed_batch),
+        )
        return processed_batch
 
+
     def reset(self):
         self.actions = None
         self.debug_actions = None
-        self.policy = ModelWrapper.load_from_checkpoint(self.policy_path, weights_only=False)
+        self.policy = ModelWrapper.load_from_checkpoint(self.policy_path, weights_only=False, map_location="cpu")
+        self.policy = self.policy.to('cuda')
         if getattr(self.policy.model, "diffusion", False):
             for head in self.policy.model.nets.policy.heads:
                 if isinstance(self.policy.model.nets.policy.heads[head], DenoisingPolicy):
                     self.policy.model.nets.policy.heads[head].num_inference_steps = 10
 
@@ -402,7 +566,6 @@ def main(
     kinematics_solver = EvaMinkKinematicsSolver(
         model_path="/home/robot/robot_ws/egomimic/resources/model_x5.xml"
     )
-
     try:
         with _KeyPoll() as kp:
             reset_rollout(ri, policy)
@@ -531,7 +694,7 @@ def main(
     parser.add_argument(
         "--resampled-action-len",
         type=int,
-        default=None,
+        default=45,
         help="Resample each predicted action chunk to this length (e.g., 100 -> 45). Euler if --cartesian.",
     )
 
@@ -545,6 +708,10 @@ def main(
     episodes = args.episodes if args.episodes is not None else [0]
 
     print(f"Resampling actions to {args.resampled_action_len}")
+    # breakpoint()
+    # ModelWrapper.load_from_checkpoint(args.policy_path)
+    # exit()
+
     main(
         arms=args.arms,
         frequency=args.frequency,
diff --git a/external/openpi b/external/openpi
index 981483dc..d1c4ac34 160000
--- a/external/openpi
+++ b/external/openpi
@@ -1 +1 @@
-Subproject commit 981483dca0fd9acba698fea00aa6e52d56a66c58
+Subproject commit d1c4ac34f3dbc6b9576882ce7f1576070827dc3f
diff --git a/pi05.md b/pi05.md
index 496893c2..91635119 100644
--- a/pi05.md
+++ b/pi05.md
@@ -28,4 +28,18 @@ uv run external/openpi/examples/convert_jax_model_to_pytorch.py \
 After running this step, lerobot environment versions get messed up, so run the following:
 ```
 uv pip install -e external/lerobot
-```
\ No newline at end of file
+```
+
+
+# Evaluation on robot
+
+```
+git submodule update --init --recursive
+pip install -e external/openpi
+pip install -e external/lerobot
+
+cd external/openpi
+cp -r ./src/openpi/models_pytorch/transformers_replace/* /usr/local/lib/python3.10/dist-packages/transformers/
+```
+
+
diff --git a/pull_models.sh b/pull_models.sh
index f1f54aa7..eed9f514 100755
--- a/pull_models.sh
+++ b/pull_models.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env bash
-set -euo pipefail
+# set -euo pipefail
 
 # ====== config (edit these) ======
-REMOTE_USER_HOST="rpunamiya6@sky1.cc.gatech.edu"
-REMOTE_PATH="/coc/flash7/rpunamiya6/Projects/EgoVerse/logs/everse_cup_on_saucer"
-LOCAL_PATH="./egomimic/robot/models/put_cup_on_saucer"
+REMOTE_USER_HOST="acheluva3@sky2.cc.gatech.edu"
+REMOTE_PATH="/coc/cedarp-dxu345-0/acheluva3/EgoVerse/logs/trained/cotrain_objcont_ft_pi_8/object in container cotrain finetune pi 8 gpu_2026-01-10_05-24-38/0/checkpoints"
+LOCAL_PATH="./egomimic/robot/models/pi_cotrain_objcont"
 # =================================
 
 mkdir -p "$LOCAL_PATH"
diff --git a/requirements.txt b/requirements.txt
index bc48e8d8..50e2dcf5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-torch==2.6.0
-torchvision==0.21.0
+torch==2.5.0
+torchvision==0.20.0
 projectaria-tools[all]==2.0.0
 pyyaml
 matplotlib
@@ -19,8 +19,8 @@ rospkg
 einops
 av==12.0.0
 opencv-python==4.7.0.72
-dm-control==1.0.8
-mink
+# dm-control==1.0.8
+mink==0.0.13
 mujoco
 mujoco-py==2.1.2.14
 submitit
diff --git a/run_eva_docker.sh b/run_eva_docker.sh
index 531773d0..fd2046b8 100755
--- a/run_eva_docker.sh
+++ b/run_eva_docker.sh
@@ -90,7 +90,7 @@ echo "  ${RS_DEVICE_ARGS[*]}"
 echo "  -v /dev/aria_usb:/dev/aria_usb"
 echo
 
-docker run -it --network host \
+docker run -it --gpus all --network host \
   "${CAN_DEVICES[@]}" \
   "${VIDEO_DEVICES[@]}" \
   "${RS_DEVICE_ARGS[@]}" \
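
For reference, a rough sketch of how the new pi_rollout.py would be invoked, inferred from the argument parser it adds (the checkpoint path below is illustrative; --frequency, --query_frequency, and --resampled-action-len already default to the values shown above):

    # policy rollout on the right arm in Cartesian space (illustrative checkpoint path)
    python3 egomimic/robot/pi_rollout.py \
        --arms right \
        --cartesian \
        --policy-path egomimic/robot/models/pi_cotrain_objcont/epoch_epoch=399.ckpt

Replaying a recorded demo instead passes --dataset-path (plus --repo-id and --episodes for the LeRobot replay path) and omits --policy-path.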