diff --git a/egomimic/algo/hpt.py b/egomimic/algo/hpt.py index 6ae47832..1deca207 100644 --- a/egomimic/algo/hpt.py +++ b/egomimic/algo/hpt.py @@ -829,7 +829,6 @@ def __init__( self.domains = domains.copy() self.auxiliary_ac_keys = auxiliary_ac_keys.copy() self.shared_ac_key = kwargs.get("shared_ac_key", None) - self.is_6dof = kwargs.get("6dof", False) self.kinematics_solver = kwargs.get("kinematics_solver", None) model = HPTModel(**trunk) @@ -1282,13 +1281,16 @@ def compute_losses(self, predictions, batch): embodiment_name = get_embodiment(embodiment_id).lower() bc_loss = predictions[f"{embodiment_name}_loss"] scaled_bc_loss = bc_weight * bc_loss - total_action_loss += scaled_bc_loss + total_action_loss = total_action_loss + scaled_bc_loss loss_dict[f"{embodiment_name}_loss"] = bc_loss # for logging if self.ot: loss_dict["ot_loss"] = predictions["ot_loss"] loss_dict["avg_feature_distance"] = predictions["avg_feature_distance"] - total_action_loss += ot_weight * self.temperature * predictions["ot_loss"] + total_action_loss = ( + total_action_loss + + ot_weight * self.temperature * predictions["ot_loss"] + ) loss_dict["action_loss"] = total_action_loss / len(self.domains) return loss_dict @@ -1372,7 +1374,6 @@ def _robomimic_to_hpt_data( if key in batch: data[key] = batch[key] - data["is_6dof"] = self.is_6dof data["pad_mask"] = batch["pad_mask"] data["embodiment"] = batch["embodiment"] diff --git a/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml new file mode 100644 index 00000000..111ef609 --- /dev/null +++ b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml @@ -0,0 +1,73 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: 
/coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +valid_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +train_dataloader_params: + eva_bimanual: + batch_size: 64 + num_workers: 10 + aria_bimanual: + batch_size: 64 + num_workers: 10 +valid_dataloader_params: + eva_bimanual: + 
batch_size: 64 + num_workers: 10 + aria_bimanual: + batch_size: 64 + num_workers: 10 diff --git a/egomimic/hydra_configs/hydra/launcher/submitit.yaml b/egomimic/hydra_configs/hydra/launcher/submitit.yaml index c56f2cd5..b068685e 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit.yaml @@ -4,15 +4,15 @@ defaults: _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher # Slurm configuration -name: ${hydra.job.name} # Default job name -partition: "rl2-lab" # Slurm partition (e.g., 'gpu' or 'compute') -account: "rl2-lab" # Slurm account (e.g., 'my_account') -cpus_per_task: 12 # Number of CPUs per task -nodes: ${launch_params.nodes} # Number of nodes -tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node +name: ${hydra.job.name} # Default job name +partition: "hoffman-lab" # Slurm partition (e.g., 'gpu' or 'compute') +account: "hoffman-lab" # Slurm account (e.g., 'my_account') +cpus_per_task: 12 # Number of CPUs per task +nodes: ${launch_params.nodes} # Number of nodes +tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node gres: "gpu:a40:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count -qos: "short" # Slurm QoS -timeout_min: 2880 # Timeout in minutes (48 hours) -exclude: "protocol, puma" # Nodes to exclude +qos: "short" # Slurm QoS +timeout_min: 2880 # Timeout in minutes (48 hours) +exclude: "protocol, puma" # Nodes to exclude additional_parameters: - requeue: true \ No newline at end of file + requeue: true diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml new file mode 100644 index 00000000..b256d18d --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + 
eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 6 + cond_dim: 256 + hidden_dim: 256 + act_dim: 128 + act_seq: 12 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml new file mode 100644 index 00000000..c22bfdf5 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + 
aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml new file mode 100644 index 00000000..305c8d62 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml @@ -0,0 +1,70 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: 
"actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + output_dim: 14 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 140 + hidden_dim: 256 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + output_dim: 140 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml new file mode 100644 index 00000000..96e2d9ca --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml @@ -0,0 +1,147 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.hpt.HPT + data_schematic: _${data.dataset.data_schematic} + camera_transforms: + aria_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + eva_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + 
intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + eva_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + reverse_kl_samples: 8 + + trunk: + embed_dim: 256 + num_blocks: 16 + num_heads: 8 + token_postprocessing: "action_token" + observation_horizon: 1 + action_horizon: 64 + no_trunk: false + use_domain_embedding: true + drop_path: 0.1 + weight_init_style: "pytorch" + + multitask: false + pretrained: false + pretrained_checkpoint: null + domains: ["eva_bimanual", "aria_bimanual"] + shared_obs_keys: ["front_img_1"] + + shared_stem_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + stem_specs: + aria_bimanual: + state_keypoints: # TODO: check if this is added to dataschematic correctly + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 140 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + eva_bimanual: + state_joint_positions: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 14 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + right_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 
+ modality_embed_dim: 256 + left_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + encoder_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + right_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + left_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.ColorJitter + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.05 + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-5 + weight_decay: 0.0001 diff --git a/egomimic/hydra_configs/train_zarr_latent.yaml b/egomimic/hydra_configs/train_zarr_latent.yaml new file mode 100644 index 00000000..9280721a --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent.yaml @@ -0,0 +1,111 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent_mlp + - visualization: eva_cartesian_aria_keypoints + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain + - callbacks: checkpoints + - override hydra/launcher: submitit + - _self_ + +name: latent_flow +description: latent_flow +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # 
Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoint_positions: + key_type: proprio_keys + zarr_key: observations.state.keypoints + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: 
observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 diff --git a/egomimic/hydra_configs/trainer/ddp.yaml b/egomimic/hydra_configs/trainer/ddp.yaml index d3d90aca..d4359f17 100644 --- a/egomimic/hydra_configs/trainer/ddp.yaml +++ b/egomimic/hydra_configs/trainer/ddp.yaml @@ -1,11 +1,11 @@ defaults: - default -strategy: ddp +strategy: ddp_find_unused_parameters_true accelerator: gpu devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'} num_nodes: ${launch_params.nodes} sync_batchnorm: True check_val_every_n_epoch: 200 -num_sanity_val_steps: 0 \ No newline at end of file +num_sanity_val_steps: 0 diff --git a/egomimic/hydra_configs/trainer/debug.yaml b/egomimic/hydra_configs/trainer/debug.yaml index e3a9a1a5..905d3711 100644 --- a/egomimic/hydra_configs/trainer/debug.yaml +++ b/egomimic/hydra_configs/trainer/debug.yaml @@ -3,7 +3,7 @@ defaults: strategy: ddp_find_unused_parameters_true limit_train_batches: 5 -limit_val_batches: 20 +limit_val_batches: 3 check_val_every_n_epoch: 2 profiler: simple max_epochs: 4 diff --git a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml index 8c4d1c91..33ae292c 100644 --- a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml +++ b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml @@ -1,14 +1,10 @@ eva_bimanual: - action_keys: actions_cartesian - viz_function: - _target_: egomimic.rldb.embodiment.eva.Eva.viz - _partial_: true - mode: traj - intrinsics_key: base_half + _target_: egomimic.rldb.embodiment.eva.Eva.viz_cartesian_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_cartesian aria_bimanual: - action_keys: actions_cartesian - viz_function: - 
_target_: egomimic.rldb.embodiment.human.Aria.viz - _partial_: true - mode: keypoints - intrinsics_key: base_half + _target_: egomimic.rldb.embodiment.human.Human.viz_keypoints_gt_preds + _partial_: true + image_key: front_img_1 + action_key: actions_keypoints diff --git a/egomimic/models/codec/mlp.py b/egomimic/models/codec/mlp.py new file mode 100644 index 00000000..43efe3a6 --- /dev/null +++ b/egomimic/models/codec/mlp.py @@ -0,0 +1,16 @@ +import torch +import torch.nn as nn + + +class MLPProjection(nn.Module): + def __init__(self, input_dim: int, hidden_dim: int, output_dim: int): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.GELU(), + nn.Linear(hidden_dim, output_dim), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # x is in (B, T, D) -> (B, T, H) + return self.net(x) diff --git a/egomimic/models/codec/temporal_enc_dec.py b/egomimic/models/codec/temporal_enc_dec.py new file mode 100644 index 00000000..4547c567 --- /dev/null +++ b/egomimic/models/codec/temporal_enc_dec.py @@ -0,0 +1,477 @@ +from __future__ import annotations + +import torch +import torch.nn as nn + + +class SmallTemporalEncoder(nn.Module): + """ + Temporal encoder sized for 100-step action sequences (100 -> 12 steps) + """ + + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + hidden_dim: int = 64, + use_layernorm: bool = True, + ): + super().__init__() + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + layers = [ + nn.Conv1d(action_dim, action_dim * 2, kernel_size=8, stride=2, padding=3), + self.act, + nn.Conv1d( + action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=2 + ), + self.act, + nn.Conv1d( + action_dim * 2, action_dim * 2, kernel_size=8, stride=2, padding=3 + ), + self.act, + ] + + self.down = nn.Sequential(*layers) + self.proj = 
nn.Linear(action_dim * 2, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: (B, T, D) or (T, D) + Output: (B, K, H) or (K, H) + """ + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D, K) + x = x.transpose(1, 2) # (B, K, D) + x = self.proj(x) # (B, K, H) + + return x.squeeze(0) if squeeze_B else x + + +class SmallTemporalDecoder(nn.Module): + """ + Decoder that mirrors SmallTemporalEncoder: + Enc convs (over time, channels-first): + (D -> 2D) k=8 s=2 p=3 + (2D -> 2D) k=8 s=2 p=2 + (2D -> 2D) k=8 s=2 p=3 + For T=100 this encoder produces K=12. + + This decoder maps: + Input: (B, K=12, H=64) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.T = T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + C2 = action_dim * 2 + + self.proj = nn.Linear(hidden_dim, C2) + self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity() + + self.up = nn.Sequential( + nn.ConvTranspose1d( + C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0 + ), + self.act, + nn.ConvTranspose1d( + C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0 + ), + self.act, + nn.ConvTranspose1d( + C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0 + ), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise 
ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + + x = self.norm(self.proj(z)) # (B, K, 2D) + x = x.transpose(1, 2) # (B, 2D, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalEncoder(nn.Module): + """ + Encoder for (B, T=100, D) that downsamples time: 100 -> 12. + NOTE: the conv stack keeps D channels (no D -> D/2 halving is applied). + Output: (B, K=12, H) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.expect_T = expect_T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + D = action_dim + + self.down = nn.Sequential( + nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3), # 100 -> 50 + self.act, + nn.Conv1d( + action_dim, action_dim, kernel_size=8, stride=2, padding=2 + ), # 50 -> 24 + self.act, + nn.Conv1d( + action_dim, action_dim, kernel_size=8, stride=2, padding=3 + ), # 24 -> 12 + self.act, + ) + + self.proj = nn.Linear(action_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + B, T, D = x.shape + if D != self.action_dim: + raise ValueError(f"Expected D={self.action_dim}, got {D}") + if self.expect_T is not None 
and T != self.expect_T: + raise ValueError(f"Expected T={self.expect_T}, got {T}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D, K=12) + x = x.transpose(1, 2) # (B, K, D) + x = self.proj(x) # (B, K, H) + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalDecoder(nn.Module): + """ + Decoder that mirrors LargeTemporalEncoder: + time: 12 -> 24 -> 50 -> 100 + channels: H -> D (proj), then D through every transposed conv + Input: (B, K=12, H) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.half_dim = action_dim // 2 + self.hidden_dim = hidden_dim + self.T = T + + if activation == "relu": + self.act = nn.ReLU() + elif activation == "gelu": + self.act = nn.GELU() + elif activation == "silu": + self.act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + self.proj = nn.Linear(hidden_dim, action_dim) + self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity() + + # Mirrors paddings/strides/kernels in reverse. + # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params. 
+ self.up = nn.Sequential( + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=3, + output_padding=0, + ), + self.act, + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=2, + output_padding=0, + ), + self.act, + nn.ConvTranspose1d( + action_dim, + action_dim, + kernel_size=8, + stride=2, + padding=3, + output_padding=0, + ), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + + x = self.norm(self.proj(z)) # (B, K, D) + x = x.transpose(1, 2) # (B, D, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + return x.squeeze(0) if squeeze_B else x + + +class SmallTemporalEncoder_32_256(SmallTemporalEncoder): + """ + Temporal encoder for 100-step action sequences (one strided conv, 100 -> 32 steps) + """ + + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + hidden_dim: int = 256, + use_layernorm: bool = True, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + ) + + layers = [ + nn.Conv1d(action_dim, 512, kernel_size=9, stride=3, padding=2), + self.act, + ] + + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(512, hidden_dim) + + +class SmallTemporalDecoder_32_256(SmallTemporalDecoder): + """ + Decoder that mirrors SmallTemporalEncoder_32_256: + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + ) + + layers 
= [ + nn.ConvTranspose1d(512, action_dim, kernel_size=9, stride=3, padding=1), + self.act, + ] + + self.up = nn.Sequential(*layers) + self.proj = nn.Linear(hidden_dim, 512) + self.norm = nn.LayerNorm(512) if use_layernorm else nn.Identity() + + +class LargeTemporalEncoder_32_256(LargeTemporalEncoder): + """ + Encoder for (B, T=100, D) that expands channels: D -> 1024, + and downsamples time: 100 -> 32 with a single strided conv. + Output: (B, K=32, H) + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + expect_T=expect_T, + ) + layers = [ + nn.Conv1d(action_dim, 1024, kernel_size=9, stride=3, padding=2), + self.act, + ] + + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(1024, hidden_dim) + + +class LargeTemporalDecoder_32_256(LargeTemporalDecoder): + """ + Decoder that mirrors LargeTemporalEncoder_32_256: + """ + + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 256, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__( + action_dim=action_dim, + hidden_dim=hidden_dim, + activation=activation, + use_layernorm=use_layernorm, + K=K, + T=T, + ) + + layers = [ + nn.ConvTranspose1d(1024, action_dim, kernel_size=9, stride=3, padding=1), + self.act, + ] + + self.up = nn.Sequential(*layers) + self.proj = nn.Linear(hidden_dim, 1024) + self.norm = nn.LayerNorm(1024) if use_layernorm else nn.Identity() + + +def count_params(module: nn.Module, trainable_only: bool = False) -> int: + if trainable_only: + return sum(p.numel() for p in module.parameters() if p.requires_grad) + return sum(p.numel() for p in module.parameters()) + + +def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None: + total = 0 + for name, p in module.named_parameters(): + if 
trainable_only and not p.requires_grad: + continue + n = p.numel() + total += n + print(f"{name:60s} {tuple(p.shape)!s:20s} {n}") + print(f"\nTOTAL params: {total}") + + +if __name__ == "__main__": + B, T, D = 8, 100, 140 + + enc = LargeTemporalEncoder_32_256(action_dim=D) + dec = LargeTemporalDecoder_32_256(action_dim=D, use_layernorm=True) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + # z shape is (8, 32, 256) + print("LargeTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) + + B, T, D = 8, 100, 14 + enc = SmallTemporalEncoder_32_256(action_dim=D) + dec = SmallTemporalDecoder_32_256(action_dim=D, use_layernorm=True) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + # z shape is (8, 32, 256) + + print("SmallTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py index 645a8c44..5c5225b0 100644 --- a/egomimic/models/denoising_policy.py +++ b/egomimic/models/denoising_policy.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from egomimic.models.denoising_nets import ConditionalUnet1D +from egomimic.rldb.embodiment.embodiment import get_embodiment class DenoisingPolicy(nn.Module): @@ -23,29 +24,59 @@ def __init__( self, model: ConditionalUnet1D, action_horizon: int, - infer_ac_dims: dict, num_inference_steps: int = None, + embodiment_specs: dict = None, **kwargs, ): super().__init__() self.model = model self.action_horizon = action_horizon - self.infer_ac_dims = infer_ac_dims self.num_inference_steps = num_inference_steps + self.embodiment_specs = embodiment_specs + self.codec_enabled = False + + _codecs = {} + if embodiment_specs is not None: + for _emb_name, _spec in embodiment_specs.items(): + if _spec.get("encoder") is not None: + _codecs[f"{_emb_name}_encoder"] = _spec["encoder"] + if _spec.get("decoder") is 
not None: + _codecs[f"{_emb_name}_decoder"] = _spec["decoder"] + if _codecs: + self.codecs = nn.ModuleDict(_codecs) self.padding = kwargs.get("padding", None) self.pooling = kwargs.get("pooling", None) - self.model_type = kwargs.get("model_type", None) - - if not infer_ac_dims: - raise ValueError("infer_ac_dims must be a non-empty dict") for name, param in self.model.named_parameters(): if not param.requires_grad: print(f"[warn] {name} has requires_grad=False") total_params = sum(p.numel() for p in self.model.parameters()) + if self.embodiment_specs is not None: + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("ac_dims") is None: + raise ValueError(f"ac_dims must be specified for {embodiment_name}") + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("encoder") is not None: + encoder_params = sum( + p.numel() for p in spec["encoder"].parameters() + ) + self.codec_enabled = True + if spec.get("decoder") is not None: + decoder_params = sum( + p.numel() for p in spec["decoder"].parameters() + ) + self.codec_enabled = True + print( + f"[{embodiment_name}] Encoder params: {encoder_params / 1e6:.2f}M" + ) + print( + f"[{embodiment_name}] Decoder params: {decoder_params / 1e6:.2f}M" + ) + total_params += encoder_params + decoder_params + print( f"[{self.__class__.__name__}] Total trainable parameters: {total_params / 1e6:.2f}M" ) @@ -60,7 +91,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ( len(global_cond), self.action_horizon, - self.infer_ac_dims[embodiment_name], + self.embodiment_specs[embodiment_name].get("ac_dims"), ), dtype=global_cond.dtype, device=global_cond.device, @@ -68,7 +99,9 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ) return noise, global_cond - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> torch.Tensor: # pyright: 
ignore[reportUnusedParameter] """ To be implemented in subclass: predict actions from noise and conditioning. """ @@ -78,13 +111,15 @@ def sample_action(self, global_cond, embodiment_name, generator=None): noise, global_cond = self.preprocess_sampling( global_cond, embodiment_name, generator ) - return self.inference(noise, global_cond, generator) + return self.inference(noise, global_cond, embodiment_name, generator) def forward(self, global_cond): - cond, embodiment = global_cond - return self.sample_action(cond, embodiment) + cond, embodiment_name = global_cond + return self.sample_action(cond, embodiment_name) - def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: + def predict( + self, actions, global_cond, embodiment_name + ) -> Tuple[torch.Tensor, torch.Tensor]: """ To be implemented in subclass: returns (prediction, target) given action input and conditioning. """ @@ -96,7 +131,7 @@ def loss_fn(self, pred, target): """ return F.mse_loss(pred, target) - def preprocess_compute_loss(self, global_cond, data): + def preprocess_compute_loss(self, global_cond, data, embodiment_name): if self.pooling == "mean": global_cond = global_cond.mean(dim=1) elif self.pooling == "flatten": @@ -122,6 +157,9 @@ def preprocess_compute_loss(self, global_cond, data): return actions, global_cond def compute_loss(self, global_cond, data): - actions, global_cond = self.preprocess_compute_loss(global_cond, data) - pred, target = self.predict(actions, global_cond) + embodiment_name = get_embodiment(data["embodiment"][0].item()).lower() + actions, global_cond = self.preprocess_compute_loss( + global_cond, data, embodiment_name + ) + pred, target = self.predict(actions, global_cond, embodiment_name) return self.loss_fn(pred, target) diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py index e41f4943..27e74ee5 100644 --- a/egomimic/models/fm_policy.py +++ b/egomimic/models/fm_policy.py @@ -24,32 +24,37 @@ def __init__( self, model: 
ConditionalUnet1D, action_horizon, - infer_ac_dims, num_inference_steps=None, + embodiment_specs=None, **kwargs, ): super().__init__( - model, action_horizon, infer_ac_dims, num_inference_steps, **kwargs + model, action_horizon, num_inference_steps, embodiment_specs, **kwargs ) self.time_dist = kwargs.get("time_dist", "beta") + self.dt = -1.0 / self.num_inference_steps - def step(self, x_t, t, global_cond): + def step(self, x_t, t, global_cond, embodiment_name): if len(t.shape) != 1: t = torch.tensor([t], device=global_cond.device) - v_t = self.model(x_t, t, global_cond) + v_t = self.denoising_model(x_t, t, global_cond, embodiment_name) return x_t + self.dt * v_t, t + self.dt @override - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> torch.Tensor: self.dt = -1.0 / self.num_inference_steps x_t = noise time = torch.ones((len(global_cond)), device=global_cond.device) while time[0] >= -self.dt / 2: - x_t, time = self.step(x_t, time, global_cond) + x_t, time = self.step(x_t, time, global_cond, embodiment_name) return x_t @override - def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: + def predict( + self, actions, global_cond, embodiment_name + ) -> Tuple[torch.Tensor, torch.Tensor]: noise = torch.randn(actions.shape, device=actions.device) batch_shape = (actions.shape[0],) if self.time_dist == "beta": @@ -65,8 +70,45 @@ def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: x_t = time_expanded * noise + (1 - time_expanded) * actions u_t = noise - actions - v_t = self.model(x_t, time, global_cond) + v_t = self.denoising_model(x_t, time, global_cond, embodiment_name) target = u_t pred = v_t return pred, target + + def denoising_model(self, x_t, time, global_cond, embodiment_name): + if self.codec_enabled: + x_t = self.embodiment_specs[embodiment_name]["encoder"](x_t) + else: + x_t = x_t + v_t = self.model(x_t, 
time, global_cond) + if self.codec_enabled: + v_t = self.embodiment_specs[embodiment_name]["decoder"](v_t) + else: + v_t = v_t + return v_t + + +if __name__ == "__main__": + import hydra + from omegaconf import OmegaConf + + cfg = OmegaConf.load( + "/coc/flash7/paphiwetsa3/projects/EgoVerse/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml" + ) + model = hydra.utils.instantiate(cfg.robomimic_model.head_specs.shared) + + # test the model + aria_input = torch.randn(8, 100, 140) + global_cond = torch.randn(8, 64, 256) + aria_output = model.step( + aria_input, torch.tensor([0.0]), global_cond, "aria_bimanual" + ) + aria_output_inference = model.inference(aria_input, global_cond, "aria_bimanual") + aria_output_predict = model.predict(aria_input, global_cond, "aria_bimanual") + + eva_input = torch.randn(8, 100, 14) + eva_output = model.step(eva_input, torch.tensor([0.0]), global_cond, "eva_bimanual") + eva_output_inference = model.inference(eva_input, global_cond, "eva_bimanual") + eva_output_predict = model.predict(eva_input, global_cond, "eva_bimanual") + breakpoint() diff --git a/egomimic/rldb/embodiment/human.py b/egomimic/rldb/embodiment/human.py index bab7432b..5c8e3da3 100644 --- a/egomimic/rldb/embodiment/human.py +++ b/egomimic/rldb/embodiment/human.py @@ -2,7 +2,9 @@ from typing import Literal -from egomimic.rldb.embodiment.embodiment import Embodiment +import numpy as np + +from egomimic.rldb.embodiment.embodiment import Embodiment, get_embodiment from egomimic.rldb.zarr.action_chunk_transforms import ( ActionChunkCoordinateFrameTransform, ConcatKeys, @@ -15,6 +17,7 @@ ) from egomimic.utils.type_utils import _to_numpy from egomimic.utils.viz_utils import ( + ColorPalette, _viz_axes, _viz_keypoints, _viz_traj, @@ -43,6 +46,30 @@ def get_transform_list( f"Unsupported mode '{mode}'. Expected one of: 'cartesian', 'keypoints'." 
) + @classmethod + def viz_keypoints_gt_preds( + cls, predictions, batch, image_key, action_key, **kwargs + ): + embodiment_id = batch["embodiment"][0].item() + embodiment_name = get_embodiment(embodiment_id).lower() + + images = batch[image_key] + actions = batch[action_key] + pred_actions = predictions[f"{embodiment_name}_{action_key}"] + ims_list = [] + images = _to_numpy(images) + actions = _to_numpy(actions) + pred_actions = _to_numpy(pred_actions) + for i in range(images.shape[0]): + image = images[i] + action = actions[i] + pred_action = pred_actions[i] + ims = cls.viz(image, action, mode="keypoints", color="Reds", **kwargs) + ims = cls.viz(ims, pred_action, mode="keypoints", color="Greens", **kwargs) + ims_list.append(ims) + ims = np.stack(ims_list, axis=0) + return ims + @classmethod def viz_transformed_batch( cls, @@ -50,6 +77,7 @@ def viz_transformed_batch( mode=Literal["traj", "axes", "keypoints"], action_key="actions_cartesian", image_key=None, + **kwargs, ): image_key = image_key or cls.VIZ_IMAGE_KEY action_key = action_key or "actions_cartesian" @@ -59,7 +87,11 @@ def viz_transformed_batch( actions = _to_numpy(batch[action_key][0]) return cls.viz( - images=images, actions=actions, mode=mode, intrinsics_key=intrinsics_key + images=images, + actions=actions, + mode=mode, + intrinsics_key=intrinsics_key, + **kwargs, ) @classmethod @@ -87,13 +119,25 @@ def viz( **kwargs, ) if mode == "keypoints": + color = kwargs.get("color", None) + if color is not None and ColorPalette.is_valid(color): + n = len(cls.FINGER_COLORS) + colors = { + finger: ColorPalette.to_rgb(color, value=(i + 1) / (n + 1)) + for i, finger in enumerate(cls.FINGER_COLORS) + } + dot_color = ColorPalette.to_rgb(color, value=0.7) + else: + colors = cls.FINGER_COLORS + dot_color = cls.DOT_COLOR return _viz_keypoints( images=images, actions=actions, intrinsics_key=intrinsics_key, edges=cls.FINGER_EDGES, - colors=cls.FINGER_COLORS, edge_ranges=cls.FINGER_EDGE_RANGES, + colors=colors, + 
dot_color=dot_color, **kwargs, ) raise ValueError( @@ -227,6 +271,7 @@ class Aria(Human): ("ring", 9, 12), ("pinky", 12, 15), ] + DOT_COLOR = (255, 165, 0) class Scale(Human): diff --git a/egomimic/trainHydra.py b/egomimic/trainHydra.py index 7f2644c5..431d2f7d 100644 --- a/egomimic/trainHydra.py +++ b/egomimic/trainHydra.py @@ -1,6 +1,7 @@ import copy import os import signal +import subprocess from collections.abc import Mapping from typing import Any, Dict, List, Optional, Tuple @@ -114,7 +115,7 @@ def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: data_schematic.infer_norm_from_dataset( norm_dataset, dataset_name, - sample_frac=0.005, + sample_frac=0.0001, benchmark_dir=os.path.join( cfg.trainer.default_root_dir, "benchmark_stats.json" ), @@ -216,6 +217,9 @@ def main(cfg: DictConfig) -> Optional[float]: :param cfg: DictConfig configuration composed by Hydra. :return: Optional[float] with optimized metric value. """ + script = os.path.join(os.path.dirname(__file__), "utils/aws/setup_secret.sh") + subprocess.run(["bash", script], check=True) + # apply extra utilities # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.) 
extras(cfg) diff --git a/egomimic/train_zarr.yaml b/egomimic/train_zarr.yaml new file mode 100644 index 00000000..26f14b3b --- /dev/null +++ b/egomimic/train_zarr.yaml @@ -0,0 +1,111 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent + - visualization: eva_cartesian_aria_cartesian + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain + - callbacks: checkpoints + - override hydra/launcher: submitit + - _self_ + +name: latent_flow +description: latent_flow +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose 
+ actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoint_positions: + key_type: proprio_keys + zarr_key: observations.state.keypoints + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 diff --git a/egomimic/utils/viz_utils.py b/egomimic/utils/viz_utils.py index bff862bd..d4289a74 100644 --- a/egomimic/utils/viz_utils.py +++ b/egomimic/utils/viz_utils.py @@ -1,4 +1,5 @@ import cv2 +import matplotlib.pyplot as plt import numpy as np from scipy.spatial.transform import Rotation as R @@ -22,6 +23,13 @@ class ColorPalette: def is_valid(cls, name: str) -> bool: return name in vars(cls).values() + @classmethod + def to_rgb(cls, cmap_name: str, value: float = 0.7) -> tuple[int, int, int]: + """Convert a ColorPalette cmap name to an RGB tuple (0-255). 
+ value: 0-1, where higher = darker shade.""" + rgba = plt.get_cmap(cmap_name)(value) + return tuple(int(c * 255) for c in rgba[:3]) + def _prepare_viz_image(img): if img.ndim == 3 and img.shape[0] in (1, 3): @@ -161,7 +169,7 @@ def _draw_rotation_at_anchor( def _viz_keypoints( - images, actions, intrinsics_key, edges, colors, edge_ranges, **kwargs + images, actions, intrinsics_key, edges, edge_ranges, colors, dot_color, **kwargs ): """Visualize all 21 MANO keypoints per hand, projected onto the image.""" # Prepare image @@ -178,7 +186,7 @@ def _viz_keypoints( keypoints = {} keypoints["left"] = left_keypoints.reshape(-1, 3) keypoints["right"] = right_keypoints.reshape(-1, 3) - for hand, dot_color in [("left", (0, 120, 255)), ("right", (255, 80, 0))]: + for hand, dot_color in [("left", dot_color), ("right", dot_color)]: kps_cam = keypoints[hand] # Camera frame -> pixels kps_px = cam_frame_to_cam_pixels(kps_cam, intrinsics) # (42, 3+) 21 per arm