From ec93aa419bfcd45b9fc064945a3aad8252cf5a2d Mon Sep 17 00:00:00 2001 From: ElmoPA Date: Sat, 28 Feb 2026 00:44:36 -0500 Subject: [PATCH 1/2] Latent Flow matching that works regardless of dimension dim --- .../hpt_cotrain_flow_shared_head_latent.yaml | 75 +++++ egomimic/models/conv/temporal_enc_dec.py | 305 ++++++++++++++++++ egomimic/models/denoising_policy.py | 18 +- egomimic/models/fm_policy.py | 2 + 4 files changed, 391 insertions(+), 9 deletions(-) create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml create mode 100644 egomimic/models/conv/temporal_enc_dec.py diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml new file mode 100644 index 00000000..2d45f799 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml @@ -0,0 +1,75 @@ +defaults: + - hpt_cotrain_enc_dec_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_cartesian" + aria_bimanual: "actions_cartesian" + shared_ac_key: "actions_cartesian" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + infer_ac_dims: + eva_bimanual: 14 + aria_bimanual: 14 + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 6 + cond_dim: 256 + hidden_dim: 128 + act_dim: 14 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + latent_map: + eva_bimanual: + encoder: + _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalEncoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalDecoder + action_dim: 14 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + aria_keypoints: + encoder: + _target_: 
egomimic.models.conv.temporal_enc_dec.LargeTemporalEncoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalDecoder + action_dim: 140 + hidden_dim: 128 + activation: "gelu" + use_layernorm: true + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1400 + eta_min: 1e-5 diff --git a/egomimic/models/conv/temporal_enc_dec.py b/egomimic/models/conv/temporal_enc_dec.py new file mode 100644 index 00000000..4d438c82 --- /dev/null +++ b/egomimic/models/conv/temporal_enc_dec.py @@ -0,0 +1,305 @@ +from __future__ import annotations + +from typing import List + +import torch +import torch.nn as nn + + +class SmallTemporalEncoder(nn.Module): + """ + Fix temporal encoder for 100 seq of actiona + """ + def __init__( + self, + *, + action_dim: int, + activation: str = "gelu", + use_layernorm: bool = True, + ): + super().__init__() + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + layers = [nn.Conv1d(action_dim, action_dim*2, kernel_size=8, stride=2, padding=3), + act, + nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=2), + act, + nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=3), + act, + ] + + + hidden_dim = 64 + self.down = nn.Sequential(*layers) + self.proj = nn.Linear(action_dim*2, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Input: (B, T, D) or (T, D) + Output: (B, K, H) or (K, H) + """ + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + x = x.transpose(1, 2) # (B, D, T) + x = 
self.down(x) # (B, D, K) + x = x.transpose(1, 2) # (B, K, D) + x = self.proj(x) # (B, K, H) + + return x.squeeze(0) if squeeze_B else x + +class SmallTemporalDecoder(nn.Module): + """ + Decoder that mirrors SmallTemporalEncoder: + Enc convs (over time, channels-first): + (D -> 2D) k=8 s=2 p=3 + (2D -> 2D) k=8 s=2 p=2 + (2D -> 2D) k=8 s=2 p=3 + For T=100 this encoder produces K=12. + + This decoder maps: + Input: (B, K=12, H=64) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.K = K + self.T = T + + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + C2 = action_dim * 2 + + self.proj = nn.Linear(hidden_dim, C2) + self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity() + + self.up = nn.Sequential( + nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0), + act, + nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0), + act, + nn.ConvTranspose1d(C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + if K != self.K: + raise ValueError(f"Expected K={self.K}, got {K}") + + x = self.norm(self.proj(z)) # (B, K, 2D) + x = x.transpose(1, 2) # (B, 2D, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, 
expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + + return x.squeeze(0) if squeeze_B else x + +class LargeTemporalEncoder(nn.Module): + """ + Encoder for (B, T=100, D) that halves channels: D -> D/2, + and downsamples time: 100 -> 12. + Output: (B, K=12, H) + """ + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + expect_T: int | None = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.hidden_dim = hidden_dim + self.expect_T = expect_T + + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + D = action_dim + + self.down = nn.Sequential( + nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3), # 100 -> 50 + act, + nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2), # 50 -> 24 + act, + nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3), # 24 -> 12 + act, + ) + + self.proj = nn.Linear(action_dim, hidden_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if x.dim() == 2: + x = x.unsqueeze(0) + squeeze_B = True + elif x.dim() != 3: + raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") + + B, T, D = x.shape + if D != self.action_dim: + raise ValueError(f"Expected D={self.action_dim}, got {D}") + if self.expect_T is not None and T != self.expect_T: + raise ValueError(f"Expected T={self.expect_T}, got {T}") + + x = x.transpose(1, 2) # (B, D, T) + x = self.down(x) # (B, D/2, K=12) + x = x.transpose(1, 2) # (B, K, D/2) + x = self.proj(x) # (B, K, H) + return x.squeeze(0) if squeeze_B else x + + +class LargeTemporalDecoder(nn.Module): + """ + Decoder that mirrors LargeTemporalEncoder: + time: 12 -> 24 -> 50 -> 100 + channels: H 
-> D/2 -> D + Input: (B, K=12, H) or (K, H) + Output: (B, T=100, D) or (T, D) + """ + def __init__( + self, + *, + action_dim: int, + hidden_dim: int = 64, + activation: str = "gelu", + use_layernorm: bool = True, + K: int = 12, + T: int = 100, + ): + super().__init__() + if action_dim % 2 != 0: + raise ValueError(f"action_dim must be even to halve. Got {action_dim}") + + self.action_dim = action_dim + self.half_dim = action_dim // 2 + self.hidden_dim = hidden_dim + self.K = K + self.T = T + + if activation == "relu": + act = nn.ReLU() + elif activation == "gelu": + act = nn.GELU() + elif activation == "silu": + act = nn.SiLU() + else: + raise ValueError(f"Unknown activation: {activation}") + + self.proj = nn.Linear(hidden_dim, action_dim) + self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity() + + # Mirrors paddings/strides/kernels in reverse. + # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params. + self.up = nn.Sequential( + nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), + act, + nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2, output_padding=0), + act, + nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), + ) + + def forward(self, z: torch.Tensor) -> torch.Tensor: + squeeze_B = False + if z.dim() == 2: + z = z.unsqueeze(0) + squeeze_B = True + elif z.dim() != 3: + raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") + + B, K, H = z.shape + if H != self.hidden_dim: + raise ValueError(f"Expected H={self.hidden_dim}, got {H}") + if K != self.K: + raise ValueError(f"Expected K={self.K}, got {K}") + + x = self.norm(self.proj(z)) # (B, K, D/2) + x = x.transpose(1, 2) # (B, D/2, K) + x = self.up(x) # (B, D, T) + if x.shape[-1] != self.T: + raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") + x = x.transpose(1, 2) # (B, T, D) + return x.squeeze(0) if squeeze_B else x + + +def 
count_params(module: nn.Module, trainable_only: bool = False) -> int: + if trainable_only: + return sum(p.numel() for p in module.parameters() if p.requires_grad) + return sum(p.numel() for p in module.parameters()) + + +def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None: + total = 0 + for name, p in module.named_parameters(): + if trainable_only and not p.requires_grad: + continue + n = p.numel() + total += n + print(f"{name:60s} {tuple(p.shape)!s:20s} {n}") + print(f"\nTOTAL params: {total}") + +if __name__ == "__main__": + B, T, D = 8, 100, 140 + + enc = LargeTemporalEncoder(action_dim=D) + dec = LargeTemporalDecoder(action_dim=D, use_layernorm=True) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) + + \ No newline at end of file diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py index 645a8c44..25ccc641 100644 --- a/egomimic/models/denoising_policy.py +++ b/egomimic/models/denoising_policy.py @@ -68,7 +68,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ) return noise, global_cond - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference(self, noise, global_cond, generator=None) -> torch.Tensor: # pyright: ignore[reportUnusedParameter] """ To be implemented in subclass: predict actions from noise and conditioning. 
""" @@ -78,13 +78,13 @@ def sample_action(self, global_cond, embodiment_name, generator=None): noise, global_cond = self.preprocess_sampling( global_cond, embodiment_name, generator ) - return self.inference(noise, global_cond, generator) + return self.inference(noise, global_cond, generator, embodiment_name) - def forward(self, global_cond): + def forward(self, global_cond, embodiment_name): cond, embodiment = global_cond - return self.sample_action(cond, embodiment) + return self.sample_action(cond, embodiment, embodiment_name) - def predict(self, actions, global_cond) -> Tuple[torch.Tensor, torch.Tensor]: + def predict(self, actions, global_cond, embodiment_name) -> Tuple[torch.Tensor, torch.Tensor]: """ To be implemented in subclass: returns (prediction, target) given action input and conditioning. """ @@ -96,7 +96,7 @@ def loss_fn(self, pred, target): """ return F.mse_loss(pred, target) - def preprocess_compute_loss(self, global_cond, data): + def preprocess_compute_loss(self, global_cond, data, embodiment_name): if self.pooling == "mean": global_cond = global_cond.mean(dim=1) elif self.pooling == "flatten": @@ -121,7 +121,7 @@ def preprocess_compute_loss(self, global_cond, data): return actions, global_cond - def compute_loss(self, global_cond, data): - actions, global_cond = self.preprocess_compute_loss(global_cond, data) - pred, target = self.predict(actions, global_cond) + def compute_loss(self, global_cond, data, embodiment_name): + actions, global_cond = self.preprocess_compute_loss(global_cond, data, embodiment_name) + pred, target = self.predict(actions, global_cond, embodiment_name) return self.loss_fn(pred, target) diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py index e41f4943..551853a6 100644 --- a/egomimic/models/fm_policy.py +++ b/egomimic/models/fm_policy.py @@ -26,12 +26,14 @@ def __init__( action_horizon, infer_ac_dims, num_inference_steps=None, + encoder_map=None, **kwargs, ): super().__init__( model, action_horizon, 
infer_ac_dims, num_inference_steps, **kwargs ) self.time_dist = kwargs.get("time_dist", "beta") + self.encoder_map = encoder_map def step(self, x_t, t, global_cond): if len(t.shape) != 1: From 65cf274898202da2626eff38c8181cc22b7efa23 Mon Sep 17 00:00:00 2001 From: ElmoPA Date: Thu, 5 Mar 2026 16:14:20 -0500 Subject: [PATCH 2/2] Changes for latent flow --- egomimic/algo/hpt.py | 9 +- .../data/eva_human_keypoints_cotrain.yaml | 73 +++ .../hydra/launcher/submitit.yaml | 20 +- .../hpt_cotrain_flow_shared_head_latent.yaml | 33 +- ...cotrain_flow_shared_head_latent_large.yaml | 74 +++ ...t_cotrain_flow_shared_head_latent_mlp.yaml | 70 +++ .../model/hpt_cotrain_keypoints_base.yaml | 147 ++++++ egomimic/hydra_configs/train_zarr_latent.yaml | 111 ++++ egomimic/hydra_configs/trainer/ddp.yaml | 4 +- egomimic/hydra_configs/trainer/debug.yaml | 2 +- .../eva_cartesian_aria_keypoints.yaml | 20 +- egomimic/models/codec/mlp.py | 16 + egomimic/models/codec/temporal_enc_dec.py | 477 ++++++++++++++++++ egomimic/models/conv/temporal_enc_dec.py | 305 ----------- egomimic/models/denoising_policy.py | 68 ++- egomimic/models/fm_policy.py | 60 ++- egomimic/rldb/embodiment/human.py | 51 +- egomimic/trainHydra.py | 6 +- egomimic/train_zarr.yaml | 111 ++++ egomimic/utils/viz_utils.py | 12 +- 20 files changed, 1287 insertions(+), 382 deletions(-) create mode 100644 egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml create mode 100644 egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml create mode 100644 egomimic/hydra_configs/train_zarr_latent.yaml create mode 100644 egomimic/models/codec/mlp.py create mode 100644 egomimic/models/codec/temporal_enc_dec.py delete mode 100644 egomimic/models/conv/temporal_enc_dec.py create mode 100644 egomimic/train_zarr.yaml diff --git a/egomimic/algo/hpt.py 
b/egomimic/algo/hpt.py index 6ae47832..1deca207 100644 --- a/egomimic/algo/hpt.py +++ b/egomimic/algo/hpt.py @@ -829,7 +829,6 @@ def __init__( self.domains = domains.copy() self.auxiliary_ac_keys = auxiliary_ac_keys.copy() self.shared_ac_key = kwargs.get("shared_ac_key", None) - self.is_6dof = kwargs.get("6dof", False) self.kinematics_solver = kwargs.get("kinematics_solver", None) model = HPTModel(**trunk) @@ -1282,13 +1281,16 @@ def compute_losses(self, predictions, batch): embodiment_name = get_embodiment(embodiment_id).lower() bc_loss = predictions[f"{embodiment_name}_loss"] scaled_bc_loss = bc_weight * bc_loss - total_action_loss += scaled_bc_loss + total_action_loss = total_action_loss + scaled_bc_loss loss_dict[f"{embodiment_name}_loss"] = bc_loss # for logging if self.ot: loss_dict["ot_loss"] = predictions["ot_loss"] loss_dict["avg_feature_distance"] = predictions["avg_feature_distance"] - total_action_loss += ot_weight * self.temperature * predictions["ot_loss"] + total_action_loss = ( + total_action_loss + + ot_weight * self.temperature * predictions["ot_loss"] + ) loss_dict["action_loss"] = total_action_loss / len(self.domains) return loss_dict @@ -1372,7 +1374,6 @@ def _robomimic_to_hpt_data( if key in batch: data[key] = batch[key] - data["is_6dof"] = self.is_6dof data["pad_mask"] = batch["pad_mask"] data["embodiment"] = batch["embodiment"] diff --git a/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml new file mode 100644 index 00000000..111ef609 --- /dev/null +++ b/egomimic/hydra_configs/data/eva_human_keypoints_cotrain.yaml @@ -0,0 +1,73 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + 
key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +valid_datasets: + eva_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/egoverseS3DatasetTest/ + key_map: + _target_: egomimic.rldb.embodiment.eva.Eva.get_keymap + transform_list: + _target_: egomimic.rldb.embodiment.eva.Eva.get_transform_list + filters: + robot_name: "eva_bimanual" + task: "fold_clothes" + mode: total + aria_bimanual: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.MultiDataset._from_resolver + resolver: + _target_: egomimic.rldb.zarr.zarr_dataset_multi.S3EpisodeResolver + folder_path: /coc/flash7/scratch/egoverseDebugDatasets/aria + key_map: + _target_: egomimic.rldb.embodiment.human.Aria.get_keymap + mode: keypoints + transform_list: + _target_: egomimic.rldb.embodiment.human.Aria.get_transform_list + mode: keypoints + filters: + robot_name: "aria_bimanual" + task: "fold_clothes_indomain" + mode: total +train_dataloader_params: + eva_bimanual: + batch_size: 64 + num_workers: 10 + aria_bimanual: + batch_size: 64 + num_workers: 10 +valid_dataloader_params: + eva_bimanual: + batch_size: 64 + num_workers: 10 + aria_bimanual: + batch_size: 64 + 
num_workers: 10 diff --git a/egomimic/hydra_configs/hydra/launcher/submitit.yaml b/egomimic/hydra_configs/hydra/launcher/submitit.yaml index c56f2cd5..b068685e 100644 --- a/egomimic/hydra_configs/hydra/launcher/submitit.yaml +++ b/egomimic/hydra_configs/hydra/launcher/submitit.yaml @@ -4,15 +4,15 @@ defaults: _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher # Slurm configuration -name: ${hydra.job.name} # Default job name -partition: "rl2-lab" # Slurm partition (e.g., 'gpu' or 'compute') -account: "rl2-lab" # Slurm account (e.g., 'my_account') -cpus_per_task: 12 # Number of CPUs per task -nodes: ${launch_params.nodes} # Number of nodes -tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node +name: ${hydra.job.name} # Default job name +partition: "hoffman-lab" # Slurm partition (e.g., 'gpu' or 'compute') +account: "hoffman-lab" # Slurm account (e.g., 'my_account') +cpus_per_task: 12 # Number of CPUs per task +nodes: ${launch_params.nodes} # Number of nodes +tasks_per_node: ${launch_params.gpus_per_node} # Use variable for tasks per node gres: "gpu:a40:${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'}" # GPU type and count -qos: "short" # Slurm QoS -timeout_min: 2880 # Timeout in minutes (48 hours) -exclude: "protocol, puma" # Nodes to exclude +qos: "short" # Slurm QoS +timeout_min: 2880 # Timeout in minutes (48 hours) +exclude: "protocol, puma" # Nodes to exclude additional_parameters: - requeue: true \ No newline at end of file + requeue: true diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml index 2d45f799..b256d18d 100644 --- a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent.yaml @@ -1,11 +1,11 @@ defaults: - - hpt_cotrain_enc_dec_base + - hpt_cotrain_keypoints_base robomimic_model: ac_keys: - 
eva_bimanual: "actions_cartesian" - aria_bimanual: "actions_cartesian" - shared_ac_key: "actions_cartesian" + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" 6dof: true diffusion: true @@ -20,43 +20,42 @@ robomimic_model: pooling: null padding: "zero" time_dist: "beta" - infer_ac_dims: - eva_bimanual: 14 - aria_bimanual: 14 model: _target_: egomimic.models.denoising_nets.CrossTransformer nblocks: 6 cond_dim: 256 - hidden_dim: 128 - act_dim: 14 - act_seq: 100 + hidden_dim: 256 + act_dim: 128 + act_seq: 12 n_heads: 4 dropout: 0.1 mlp_layers: 4 mlp_ratio: 4 - latent_map: + embodiment_specs: eva_bimanual: + ac_dims: 14 encoder: - _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalEncoder + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder action_dim: 14 hidden_dim: 128 activation: "gelu" use_layernorm: false decoder: - _target_: egomimic.models.conv.temporal_enc_dec.SmallTemporalDecoder + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder action_dim: 14 hidden_dim: 128 activation: "gelu" use_layernorm: true - aria_keypoints: + aria_bimanual: + ac_dims: 140 encoder: - _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalEncoder + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder action_dim: 140 hidden_dim: 128 activation: "gelu" use_layernorm: false decoder: - _target_: egomimic.models.conv.temporal_enc_dec.LargeTemporalDecoder + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder action_dim: 140 hidden_dim: 128 activation: "gelu" @@ -71,5 +70,5 @@ optimizer: scheduler: _target_: torch.optim.lr_scheduler.CosineAnnealingLR _partial_: true - T_max: 1400 + T_max: 1800 eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml new file mode 100644 index 
00000000..c22bfdf5 --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_large.yaml @@ -0,0 +1,74 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 32 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalEncoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.SmallTemporalDecoder_32_256 + action_dim: 14 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalEncoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: false + decoder: + _target_: egomimic.models.codec.temporal_enc_dec.LargeTemporalDecoder_32_256 + action_dim: 140 + hidden_dim: 256 + activation: "gelu" + use_layernorm: true + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml new file mode 100644 index 00000000..305c8d62 --- 
/dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_flow_shared_head_latent_mlp.yaml @@ -0,0 +1,70 @@ +defaults: + - hpt_cotrain_keypoints_base + +robomimic_model: + ac_keys: + eva_bimanual: "actions_eva_cart_aria_keypoints" + aria_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + 6dof: true + diffusion: true + + head_specs: + aria_bimanual: null + eva_bimanual: null + shared: + _target_: egomimic.models.fm_policy.FMPolicy + action_horizon: 100 + num_inference_steps: 50 + pooling: null + padding: "zero" + time_dist: "beta" + model: + _target_: egomimic.models.denoising_nets.CrossTransformer + nblocks: 8 + cond_dim: 256 + hidden_dim: 256 + act_dim: 256 + act_seq: 100 + n_heads: 4 + dropout: 0.1 + mlp_layers: 4 + mlp_ratio: 4 + embodiment_specs: + eva_bimanual: + ac_dims: 14 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 14 + hidden_dim: 256 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + output_dim: 14 + aria_bimanual: + ac_dims: 140 + encoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 140 + hidden_dim: 256 + output_dim: 256 + decoder: + _target_: egomimic.models.codec.mlp.MLPProjection + input_dim: 256 + hidden_dim: 256 + output_dim: 140 + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 1e-4 + weight_decay: 0.0001 + +scheduler: + _target_: torch.optim.lr_scheduler.CosineAnnealingLR + _partial_: true + T_max: 1800 + eta_min: 1e-5 diff --git a/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml new file mode 100644 index 00000000..96e2d9ca --- /dev/null +++ b/egomimic/hydra_configs/model/hpt_cotrain_keypoints_base.yaml @@ -0,0 +1,147 @@ +_target_: egomimic.pl_utils.pl_model.ModelWrapper +robomimic_model: + _target_: egomimic.algo.hpt.HPT + data_schematic: _${data.dataset.data_schematic} + 
camera_transforms: + aria_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + eva_bimanual: + _target_: egomimic.utils.egomimicUtils.CameraTransforms + intrinsics_key: "base" # change to base_half if using half res + extrinsics_key: "x5Dec13_2" + ac_keys: + aria_bimanual: "actions_eva_cart_aria_keypoints" + eva_bimanual: "actions_eva_cart_aria_keypoints" + shared_ac_key: "actions_eva_cart_aria_keypoints" + + reverse_kl_samples: 8 + + trunk: + embed_dim: 256 + num_blocks: 16 + num_heads: 8 + token_postprocessing: "action_token" + observation_horizon: 1 + action_horizon: 64 + no_trunk: false + use_domain_embedding: true + drop_path: 0.1 + weight_init_style: "pytorch" + + multitask: false + pretrained: false + pretrained_checkpoint: null + domains: ["eva_bimanual", "aria_bimanual"] + shared_obs_keys: ["front_img_1"] + + shared_stem_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + stem_specs: + aria_bimanual: + state_keypoints: # TODO: check if this is added to dataschematic correctly + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 140 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + eva_bimanual: + state_joint_positions: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 14 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + 
right_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + left_wrist_img: + _target_: egomimic.models.hpt_nets.MLPPolicyStem + input_dim: 256 + output_dim: 256 + widths: [256] + specs: + random_horizon_masking: false + cross_attn: + crossattn_latent: 16 + crossattn_heads: 8 + crossattn_dim_head: 64 + crossattn_modality_dropout: 0.1 + modality_embed_dim: 256 + + encoder_specs: + front_img_1: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + right_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + left_wrist_img: + _target_: egomimic.models.hpt_nets.ResNet + output_dim: 256 + num_of_copy: 1 + + train_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.ColorJitter + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.05 + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + eval_image_augs: + _target_: torchvision.transforms.Compose + transforms: + - _target_: torchvision.transforms.Normalize + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + +optimizer: + _target_: torch.optim.AdamW + _partial_: true + lr: 5e-5 + weight_decay: 0.0001 diff --git a/egomimic/hydra_configs/train_zarr_latent.yaml b/egomimic/hydra_configs/train_zarr_latent.yaml new file mode 100644 index 00000000..9280721a --- /dev/null +++ b/egomimic/hydra_configs/train_zarr_latent.yaml @@ -0,0 +1,111 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent_mlp + - visualization: eva_cartesian_aria_keypoints + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain + - callbacks: checkpoints + - override 
hydra/launcher: submitit + - _self_ + +name: latent_flow +description: latent_flow +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + actions_eva_cart_aria_keypoints: + key_type: action_keys + zarr_key: actions_keypoints + keypoint_positions: + key_type: proprio_keys + zarr_key: observations.state.keypoints + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + mecka_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 
+ ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + scale_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + +seed: 42 diff --git a/egomimic/hydra_configs/trainer/ddp.yaml b/egomimic/hydra_configs/trainer/ddp.yaml index d3d90aca..d4359f17 100644 --- a/egomimic/hydra_configs/trainer/ddp.yaml +++ b/egomimic/hydra_configs/trainer/ddp.yaml @@ -1,11 +1,11 @@ defaults: - default -strategy: ddp +strategy: ddp_find_unused_parameters_true accelerator: gpu devices: ${eval:'${launch_params.gpus_per_node} * ${launch_params.nodes}'} num_nodes: ${launch_params.nodes} sync_batchnorm: True check_val_every_n_epoch: 200 -num_sanity_val_steps: 0 \ No newline at end of file +num_sanity_val_steps: 0 diff --git a/egomimic/hydra_configs/trainer/debug.yaml b/egomimic/hydra_configs/trainer/debug.yaml index e3a9a1a5..905d3711 100644 --- a/egomimic/hydra_configs/trainer/debug.yaml +++ b/egomimic/hydra_configs/trainer/debug.yaml @@ -3,7 +3,7 @@ defaults: strategy: ddp_find_unused_parameters_true limit_train_batches: 5 -limit_val_batches: 20 +limit_val_batches: 3 check_val_every_n_epoch: 2 profiler: simple max_epochs: 4 diff --git a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml index 8c4d1c91..33ae292c 100644 --- a/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml +++ b/egomimic/hydra_configs/visualization/eva_cartesian_aria_keypoints.yaml @@ -1,14 +1,10 @@ eva_bimanual: - action_keys: actions_cartesian - viz_function: - _target_: 
class MLPProjection(nn.Module):
    """Two-layer perceptron applied along the last axis of its input.

    The layers are ``Linear(input_dim, hidden_dim) -> GELU ->
    Linear(hidden_dim, output_dim)`` and are stored, in that order, in
    ``self.net``.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        stages = [
            nn.Linear(input_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Only the trailing feature axis is resized, e.g. (B, T, D) -> (B, T, H).
        projected = self.net(x)
        return projected
class SmallTemporalDecoder(nn.Module):
    """Transposed-convolution decoder that undoes SmallTemporalEncoder.

    A latent sequence is projected from ``hidden_dim`` to ``2 * action_dim``
    channels, optionally layer-normalised, and then upsampled in time by three
    ``ConvTranspose1d`` layers (kernel 8, stride 2, paddings 3 / 2 / 3). With
    12 latent steps the lengths go 12 -> 24 -> 50 -> 100.

    Input:  (B, K, hidden_dim) or (K, hidden_dim)
    Output: (B, T, action_dim) or (T, action_dim)

    ``K`` is accepted for signature compatibility but is neither stored nor
    checked; the only length validation is that the decoded sequence has
    exactly ``T`` steps.
    """

    def __init__(
        self,
        *,
        action_dim: int,
        hidden_dim: int = 64,
        activation: str = "gelu",
        use_layernorm: bool = True,
        K: int = 12,
        T: int = 100,
    ):
        super().__init__()
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim
        self.T = T

        activations = {"relu": nn.ReLU, "gelu": nn.GELU, "silu": nn.SiLU}
        if activation not in activations:
            raise ValueError(f"Unknown activation: {activation}")
        self.act = activations[activation]()

        wide = 2 * action_dim

        self.proj = nn.Linear(hidden_dim, wide)
        self.norm = nn.LayerNorm(wide) if use_layernorm else nn.Identity()

        # (out_channels, padding) for each upsampling stage, in order.
        first = nn.ConvTranspose1d(
            wide, wide, kernel_size=8, stride=2, padding=3, output_padding=0
        )
        second = nn.ConvTranspose1d(
            wide, wide, kernel_size=8, stride=2, padding=2, output_padding=0
        )
        last = nn.ConvTranspose1d(
            wide, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0
        )
        # No activation after the last layer: the output is unbounded.
        self.up = nn.Sequential(first, self.act, second, self.act, last)

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """Decode latents ``z`` into an action sequence of length ``self.T``.

        Raises:
            ValueError: if ``z`` is not 2-D/3-D, its last dimension differs
                from ``hidden_dim``, or the decoded length is not ``self.T``.
        """
        if z.dim() not in (2, 3):
            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
        unbatched = z.dim() == 2
        if unbatched:
            z = z.unsqueeze(0)

        latent_width = z.shape[-1]
        if latent_width != self.hidden_dim:
            raise ValueError(f"Expected H={self.hidden_dim}, got {latent_width}")

        # (B, K, H) -> (B, K, 2D) -> (B, 2D, K) so convolutions run over time.
        channels_first = self.norm(self.proj(z)).transpose(1, 2)
        decoded = self.up(channels_first)  # (B, D, T_out)
        t_out = decoded.shape[-1]
        if t_out != self.T:
            raise ValueError(f"Got T_out={t_out}, expected T={self.T}")

        out = decoded.transpose(1, 2)  # (B, T, D)
        return out.squeeze(0) if unbatched else out
class LargeTemporalDecoder(nn.Module):
    """Transposed-convolution decoder that undoes LargeTemporalEncoder.

    Latents are projected from ``hidden_dim`` to ``action_dim`` channels,
    optionally layer-normalised, then upsampled in time by three
    ``ConvTranspose1d`` layers (kernel 8, stride 2, paddings 3 / 2 / 3) that
    all keep ``action_dim`` channels. With 12 latent steps the lengths go
    12 -> 24 -> 50 -> 100.

    Input:  (B, K, hidden_dim) or (K, hidden_dim)
    Output: (B, T, action_dim) or (T, action_dim)

    ``action_dim`` must be even; ``half_dim`` is stored but no layer uses it.
    ``K`` is accepted for signature compatibility but is neither stored nor
    checked.
    """

    def __init__(
        self,
        *,
        action_dim: int,
        hidden_dim: int = 64,
        activation: str = "gelu",
        use_layernorm: bool = True,
        K: int = 12,
        T: int = 100,
    ):
        super().__init__()
        if action_dim % 2:
            raise ValueError(f"action_dim must be even to halve. Got {action_dim}")

        self.action_dim = action_dim
        self.half_dim = action_dim // 2
        self.hidden_dim = hidden_dim
        self.T = T

        activations = {"relu": nn.ReLU, "gelu": nn.GELU, "silu": nn.SiLU}
        if activation not in activations:
            raise ValueError(f"Unknown activation: {activation}")
        self.act = activations[activation]()

        self.proj = nn.Linear(hidden_dim, action_dim)
        self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity()

        def upsample(padding: int) -> nn.ConvTranspose1d:
            # Every stage shares kernel/stride/channels; only padding varies.
            return nn.ConvTranspose1d(
                action_dim,
                action_dim,
                kernel_size=8,
                stride=2,
                padding=padding,
                output_padding=0,
            )

        # Paddings mirror the encoder in reverse; no activation after the
        # final stage.
        self.up = nn.Sequential(
            upsample(3),
            self.act,
            upsample(2),
            self.act,
            upsample(3),
        )

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """Decode latents ``z`` into an action sequence of length ``self.T``.

        Raises:
            ValueError: if ``z`` is not 2-D/3-D, its last dimension differs
                from ``hidden_dim``, or the decoded length is not ``self.T``.
        """
        if z.dim() not in (2, 3):
            raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}")
        unbatched = z.dim() == 2
        if unbatched:
            z = z.unsqueeze(0)

        latent_width = z.shape[-1]
        if latent_width != self.hidden_dim:
            raise ValueError(f"Expected H={self.hidden_dim}, got {latent_width}")

        # (B, K, H) -> (B, K, D) -> (B, D, K) so convolutions run over time.
        channels_first = self.norm(self.proj(z)).transpose(1, 2)
        decoded = self.up(channels_first)  # (B, D, T_out)
        t_out = decoded.shape[-1]
        if t_out != self.T:
            raise ValueError(f"Got T_out={t_out}, expected T={self.T}")

        out = decoded.transpose(1, 2)  # (B, T, D)
        return out.squeeze(0) if unbatched else out
class SmallTemporalDecoder_32_256(SmallTemporalDecoder):
    """Single-layer decoder that mirrors SmallTemporalEncoder_32_256.

    Latents are projected from ``hidden_dim`` to 512 channels, optionally
    layer-normalised, and upsampled by one ``ConvTranspose1d`` (kernel 9,
    stride 3, padding 1). A 32-step latent sequence therefore decodes to
    (32 - 1) * 3 - 2 + 9 = 100 steps, which is the length the inherited
    ``forward`` checks against ``self.T``.

    Args:
        action_dim: number of channels in the decoded sequence.
        hidden_dim: width of the incoming latents.
        activation: ``"relu"``, ``"gelu"`` or ``"silu"`` (validated by the
            parent constructor).
        use_layernorm: apply ``LayerNorm(512)`` after the projection.
        output_activation: append the activation after the final transposed
            convolution. Defaults to ``False`` because the decoded tensor is
            a regression output (FMPolicy uses it as the predicted velocity,
            whose target ``noise - actions`` is signed), and GELU/ReLU/SiLU
            would bound it from below. Pass ``True`` only to reproduce models
            trained with the earlier always-activated head; parameter names
            are the same either way.
    """

    def __init__(
        self,
        *,
        action_dim: int,
        hidden_dim: int = 256,
        activation: str = "gelu",
        use_layernorm: bool = True,
        output_activation: bool = False,
    ):
        # The parent builds its own proj/norm/up stack; all three are replaced
        # below. It is still called so that act, action_dim, hidden_dim and T
        # are initialised the usual way.
        super().__init__(
            action_dim=action_dim,
            hidden_dim=hidden_dim,
            activation=activation,
            use_layernorm=use_layernorm,
        )

        layers = [
            nn.ConvTranspose1d(512, action_dim, kernel_size=9, stride=3, padding=1),
        ]
        if output_activation:
            layers.append(self.act)

        self.up = nn.Sequential(*layers)
        self.proj = nn.Linear(hidden_dim, 512)
        self.norm = nn.LayerNorm(512) if use_layernorm else nn.Identity()
class LargeTemporalDecoder_32_256(LargeTemporalDecoder):
    """Single-layer decoder that mirrors LargeTemporalEncoder_32_256.

    Latents are projected from ``hidden_dim`` to 1024 channels, optionally
    layer-normalised, and upsampled by one ``ConvTranspose1d`` (kernel 9,
    stride 3, padding 1). A 32-step latent sequence therefore decodes to
    (32 - 1) * 3 - 2 + 9 = 100 steps, which is the length the inherited
    ``forward`` checks against ``self.T``.

    Args:
        action_dim: number of channels in the decoded sequence (the parent
            constructor requires it to be even).
        hidden_dim: width of the incoming latents.
        activation: ``"relu"``, ``"gelu"`` or ``"silu"`` (validated by the
            parent constructor).
        use_layernorm: apply ``LayerNorm(1024)`` after the projection.
        K: forwarded to the parent, which does not use it.
        T: expected decoded length, checked in ``forward``.
        output_activation: append the activation after the final transposed
            convolution. Defaults to ``False`` because the decoded tensor is
            a regression output (FMPolicy uses it as the predicted velocity,
            whose target ``noise - actions`` is signed), and GELU/ReLU/SiLU
            would bound it from below. Pass ``True`` only to reproduce models
            trained with the earlier always-activated head; parameter names
            are the same either way.
    """

    def __init__(
        self,
        *,
        action_dim: int,
        hidden_dim: int = 256,
        activation: str = "gelu",
        use_layernorm: bool = True,
        K: int = 12,
        T: int = 100,
        output_activation: bool = False,
    ):
        # The parent builds its own proj/norm/up stack; all three are replaced
        # below. It is still called for validation and attribute setup.
        super().__init__(
            action_dim=action_dim,
            hidden_dim=hidden_dim,
            activation=activation,
            use_layernorm=use_layernorm,
            K=K,
            T=T,
        )

        layers = [
            nn.ConvTranspose1d(1024, action_dim, kernel_size=9, stride=3, padding=1),
        ]
        if output_activation:
            layers.append(self.act)

        self.up = nn.Sequential(*layers)
        self.proj = nn.Linear(hidden_dim, 1024)
        self.norm = nn.LayerNorm(1024) if use_layernorm else nn.Identity()
# z shape is 8, 12, 64 + print("LargeTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) + + B, T, D = 8, 100, 14 + enc = SmallTemporalEncoder_32_256(action_dim=D) + dec = SmallTemporalDecoder_32_256(action_dim=D, use_layernorm=True) + + x = torch.randn(B, T, D) + z = enc(x) + x_hat = dec(z) + # z shape is 8, 12, 64 + + print("SmallTemporalEncoder_32_256") + print(count_params(enc)) + print(count_params(enc, trainable_only=True)) + print_param_breakdown(enc) diff --git a/egomimic/models/conv/temporal_enc_dec.py b/egomimic/models/conv/temporal_enc_dec.py deleted file mode 100644 index 4d438c82..00000000 --- a/egomimic/models/conv/temporal_enc_dec.py +++ /dev/null @@ -1,305 +0,0 @@ -from __future__ import annotations - -from typing import List - -import torch -import torch.nn as nn - - -class SmallTemporalEncoder(nn.Module): - """ - Fix temporal encoder for 100 seq of actiona - """ - def __init__( - self, - *, - action_dim: int, - activation: str = "gelu", - use_layernorm: bool = True, - ): - super().__init__() - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - layers = [nn.Conv1d(action_dim, action_dim*2, kernel_size=8, stride=2, padding=3), - act, - nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=2), - act, - nn.Conv1d(action_dim*2, action_dim*2, kernel_size=8, stride=2, padding=3), - act, - ] - - - hidden_dim = 64 - self.down = nn.Sequential(*layers) - self.proj = nn.Linear(action_dim*2, hidden_dim) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - """ - Input: (B, T, D) or (T, D) - Output: (B, K, H) or (K, H) - """ - squeeze_B = False - if x.dim() == 2: - x = x.unsqueeze(0) - squeeze_B = True - elif x.dim() != 3: - raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") - - x = 
x.transpose(1, 2) # (B, D, T) - x = self.down(x) # (B, D, K) - x = x.transpose(1, 2) # (B, K, D) - x = self.proj(x) # (B, K, H) - - return x.squeeze(0) if squeeze_B else x - -class SmallTemporalDecoder(nn.Module): - """ - Decoder that mirrors SmallTemporalEncoder: - Enc convs (over time, channels-first): - (D -> 2D) k=8 s=2 p=3 - (2D -> 2D) k=8 s=2 p=2 - (2D -> 2D) k=8 s=2 p=3 - For T=100 this encoder produces K=12. - - This decoder maps: - Input: (B, K=12, H=64) or (K, H) - Output: (B, T=100, D) or (T, D) - """ - def __init__( - self, - *, - action_dim: int, - hidden_dim: int = 64, - activation: str = "gelu", - use_layernorm: bool = True, - K: int = 12, - T: int = 100, - ): - super().__init__() - self.action_dim = action_dim - self.hidden_dim = hidden_dim - self.K = K - self.T = T - - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - C2 = action_dim * 2 - - self.proj = nn.Linear(hidden_dim, C2) - self.norm = nn.LayerNorm(C2) if use_layernorm else nn.Identity() - - self.up = nn.Sequential( - nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=3, output_padding=0), - act, - nn.ConvTranspose1d(C2, C2, kernel_size=8, stride=2, padding=2, output_padding=0), - act, - nn.ConvTranspose1d(C2, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), - ) - - def forward(self, z: torch.Tensor) -> torch.Tensor: - squeeze_B = False - if z.dim() == 2: - z = z.unsqueeze(0) - squeeze_B = True - elif z.dim() != 3: - raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") - - B, K, H = z.shape - if H != self.hidden_dim: - raise ValueError(f"Expected H={self.hidden_dim}, got {H}") - if K != self.K: - raise ValueError(f"Expected K={self.K}, got {K}") - - x = self.norm(self.proj(z)) # (B, K, 2D) - x = x.transpose(1, 2) # (B, 2D, K) - x = self.up(x) # (B, D, T) - if x.shape[-1] != self.T: - raise 
ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") - x = x.transpose(1, 2) # (B, T, D) - - return x.squeeze(0) if squeeze_B else x - -class LargeTemporalEncoder(nn.Module): - """ - Encoder for (B, T=100, D) that halves channels: D -> D/2, - and downsamples time: 100 -> 12. - Output: (B, K=12, H) - """ - def __init__( - self, - *, - action_dim: int, - hidden_dim: int = 64, - activation: str = "gelu", - use_layernorm: bool = True, - expect_T: int | None = 100, - ): - super().__init__() - if action_dim % 2 != 0: - raise ValueError(f"action_dim must be even to halve. Got {action_dim}") - - self.action_dim = action_dim - self.hidden_dim = hidden_dim - self.expect_T = expect_T - - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - D = action_dim - - self.down = nn.Sequential( - nn.Conv1d(D, action_dim, kernel_size=8, stride=2, padding=3), # 100 -> 50 - act, - nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2), # 50 -> 24 - act, - nn.Conv1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3), # 24 -> 12 - act, - ) - - self.proj = nn.Linear(action_dim, hidden_dim) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - squeeze_B = False - if x.dim() == 2: - x = x.unsqueeze(0) - squeeze_B = True - elif x.dim() != 3: - raise ValueError(f"Expected (T,D) or (B,T,D), got {tuple(x.shape)}") - - B, T, D = x.shape - if D != self.action_dim: - raise ValueError(f"Expected D={self.action_dim}, got {D}") - if self.expect_T is not None and T != self.expect_T: - raise ValueError(f"Expected T={self.expect_T}, got {T}") - - x = x.transpose(1, 2) # (B, D, T) - x = self.down(x) # (B, D/2, K=12) - x = x.transpose(1, 2) # (B, K, D/2) - x = self.proj(x) # (B, K, H) - return x.squeeze(0) if squeeze_B else x - - -class LargeTemporalDecoder(nn.Module): - """ - Decoder that mirrors LargeTemporalEncoder: - 
time: 12 -> 24 -> 50 -> 100 - channels: H -> D/2 -> D - Input: (B, K=12, H) or (K, H) - Output: (B, T=100, D) or (T, D) - """ - def __init__( - self, - *, - action_dim: int, - hidden_dim: int = 64, - activation: str = "gelu", - use_layernorm: bool = True, - K: int = 12, - T: int = 100, - ): - super().__init__() - if action_dim % 2 != 0: - raise ValueError(f"action_dim must be even to halve. Got {action_dim}") - - self.action_dim = action_dim - self.half_dim = action_dim // 2 - self.hidden_dim = hidden_dim - self.K = K - self.T = T - - if activation == "relu": - act = nn.ReLU() - elif activation == "gelu": - act = nn.GELU() - elif activation == "silu": - act = nn.SiLU() - else: - raise ValueError(f"Unknown activation: {activation}") - - self.proj = nn.Linear(hidden_dim, action_dim) - self.norm = nn.LayerNorm(action_dim) if use_layernorm else nn.Identity() - - # Mirrors paddings/strides/kernels in reverse. - # Lengths: 12 -> 24 -> 50 -> 100 with output_padding=0 for these params. - self.up = nn.Sequential( - nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), - act, - nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=2, output_padding=0), - act, - nn.ConvTranspose1d(action_dim, action_dim, kernel_size=8, stride=2, padding=3, output_padding=0), - ) - - def forward(self, z: torch.Tensor) -> torch.Tensor: - squeeze_B = False - if z.dim() == 2: - z = z.unsqueeze(0) - squeeze_B = True - elif z.dim() != 3: - raise ValueError(f"Expected (K,H) or (B,K,H), got {tuple(z.shape)}") - - B, K, H = z.shape - if H != self.hidden_dim: - raise ValueError(f"Expected H={self.hidden_dim}, got {H}") - if K != self.K: - raise ValueError(f"Expected K={self.K}, got {K}") - - x = self.norm(self.proj(z)) # (B, K, D/2) - x = x.transpose(1, 2) # (B, D/2, K) - x = self.up(x) # (B, D, T) - if x.shape[-1] != self.T: - raise ValueError(f"Got T_out={x.shape[-1]}, expected T={self.T}") - x = x.transpose(1, 2) # (B, T, D) - return 
x.squeeze(0) if squeeze_B else x - - -def count_params(module: nn.Module, trainable_only: bool = False) -> int: - if trainable_only: - return sum(p.numel() for p in module.parameters() if p.requires_grad) - return sum(p.numel() for p in module.parameters()) - - -def print_param_breakdown(module: nn.Module, trainable_only: bool = False) -> None: - total = 0 - for name, p in module.named_parameters(): - if trainable_only and not p.requires_grad: - continue - n = p.numel() - total += n - print(f"{name:60s} {tuple(p.shape)!s:20s} {n}") - print(f"\nTOTAL params: {total}") - -if __name__ == "__main__": - B, T, D = 8, 100, 140 - - enc = LargeTemporalEncoder(action_dim=D) - dec = LargeTemporalDecoder(action_dim=D, use_layernorm=True) - - x = torch.randn(B, T, D) - z = enc(x) - x_hat = dec(z) - - print(count_params(enc)) - print(count_params(enc, trainable_only=True)) - print_param_breakdown(enc) - - \ No newline at end of file diff --git a/egomimic/models/denoising_policy.py b/egomimic/models/denoising_policy.py index 25ccc641..5c5225b0 100644 --- a/egomimic/models/denoising_policy.py +++ b/egomimic/models/denoising_policy.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from egomimic.models.denoising_nets import ConditionalUnet1D +from egomimic.rldb.embodiment.embodiment import get_embodiment class DenoisingPolicy(nn.Module): @@ -23,29 +24,59 @@ def __init__( self, model: ConditionalUnet1D, action_horizon: int, - infer_ac_dims: dict, num_inference_steps: int = None, + embodiment_specs: dict = None, **kwargs, ): super().__init__() self.model = model self.action_horizon = action_horizon - self.infer_ac_dims = infer_ac_dims self.num_inference_steps = num_inference_steps + self.embodiment_specs = embodiment_specs + self.codec_enabled = False + + _codecs = {} + if embodiment_specs is not None: + for _emb_name, _spec in embodiment_specs.items(): + if _spec.get("encoder") is not None: + _codecs[f"{_emb_name}_encoder"] = _spec["encoder"] + if _spec.get("decoder") is not None: + 
_codecs[f"{_emb_name}_decoder"] = _spec["decoder"] + if _codecs: + self.codecs = nn.ModuleDict(_codecs) self.padding = kwargs.get("padding", None) self.pooling = kwargs.get("pooling", None) - self.model_type = kwargs.get("model_type", None) - - if not infer_ac_dims: - raise ValueError("infer_ac_dims must be a non-empty dict") for name, param in self.model.named_parameters(): if not param.requires_grad: print(f"[warn] {name} has requires_grad=False") total_params = sum(p.numel() for p in self.model.parameters()) + if self.embodiment_specs is not None: + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("ac_dims") is None: + raise ValueError(f"ac_dims must be specified for {embodiment_name}") + for embodiment_name, spec in self.embodiment_specs.items(): + if spec.get("encoder") is not None: + encoder_params = sum( + p.numel() for p in spec["encoder"].parameters() + ) + self.codec_enabled = True + if spec.get("decoder") is not None: + decoder_params = sum( + p.numel() for p in spec["decoder"].parameters() + ) + self.codec_enabled = True + print( + f"[{embodiment_name}] Encoder params: {encoder_params / 1e6:.2f}M" + ) + print( + f"[{embodiment_name}] Decoder params: {decoder_params / 1e6:.2f}M" + ) + total_params += encoder_params + decoder_params + print( f"[{self.__class__.__name__}] Total trainable parameters: {total_params / 1e6:.2f}M" ) @@ -60,7 +91,7 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ( len(global_cond), self.action_horizon, - self.infer_ac_dims[embodiment_name], + self.embodiment_specs[embodiment_name].get("ac_dims"), ), dtype=global_cond.dtype, device=global_cond.device, @@ -68,7 +99,9 @@ def preprocess_sampling(self, global_cond, embodiment_name, generator=None): ) return noise, global_cond - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: # pyright: ignore[reportUnusedParameter] + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> 
torch.Tensor: # pyright: ignore[reportUnusedParameter] """ To be implemented in subclass: predict actions from noise and conditioning. """ @@ -78,13 +111,15 @@ def sample_action(self, global_cond, embodiment_name, generator=None): noise, global_cond = self.preprocess_sampling( global_cond, embodiment_name, generator ) - return self.inference(noise, global_cond, generator, embodiment_name) + return self.inference(noise, global_cond, embodiment_name, generator) - def forward(self, global_cond, embodiment_name): - cond, embodiment = global_cond - return self.sample_action(cond, embodiment, embodiment_name) + def forward(self, global_cond): + cond, embodiment_name = global_cond + return self.sample_action(cond, embodiment_name) - def predict(self, actions, global_cond, embodiment_name) -> Tuple[torch.Tensor, torch.Tensor]: + def predict( + self, actions, global_cond, embodiment_name + ) -> Tuple[torch.Tensor, torch.Tensor]: """ To be implemented in subclass: returns (prediction, target) given action input and conditioning. 
""" @@ -121,7 +156,10 @@ def preprocess_compute_loss(self, global_cond, data, embodiment_name): return actions, global_cond - def compute_loss(self, global_cond, data, embodiment_name): - actions, global_cond = self.preprocess_compute_loss(global_cond, data, embodiment_name) + def compute_loss(self, global_cond, data): + embodiment_name = get_embodiment(data["embodiment"][0].item()).lower() + actions, global_cond = self.preprocess_compute_loss( + global_cond, data, embodiment_name + ) pred, target = self.predict(actions, global_cond, embodiment_name) return self.loss_fn(pred, target) diff --git a/egomimic/models/fm_policy.py b/egomimic/models/fm_policy.py index 551853a6..27e74ee5 100644 --- a/egomimic/models/fm_policy.py +++ b/egomimic/models/fm_policy.py @@ -24,34 +24,37 @@ def __init__( self, model: ConditionalUnet1D, action_horizon, - infer_ac_dims, num_inference_steps=None, - encoder_map=None, + embodiment_specs=None, **kwargs, ): super().__init__( - model, action_horizon, infer_ac_dims, num_inference_steps, **kwargs + model, action_horizon, num_inference_steps, embodiment_specs, **kwargs ) self.time_dist = kwargs.get("time_dist", "beta") - self.encoder_map = encoder_map + self.dt = -1.0 / self.num_inference_steps - def step(self, x_t, t, global_cond): + def step(self, x_t, t, global_cond, embodiment_name): if len(t.shape) != 1: t = torch.tensor([t], device=global_cond.device) - v_t = self.model(x_t, t, global_cond) + v_t = self.denoising_model(x_t, t, global_cond, embodiment_name) return x_t + self.dt * v_t, t + self.dt @override - def inference(self, noise, global_cond, generator=None) -> torch.Tensor: + def inference( + self, noise, global_cond, embodiment_name, generator=None + ) -> torch.Tensor: self.dt = -1.0 / self.num_inference_steps x_t = noise time = torch.ones((len(global_cond)), device=global_cond.device) while time[0] >= -self.dt / 2: - x_t, time = self.step(x_t, time, global_cond) + x_t, time = self.step(x_t, time, global_cond, embodiment_name) 
def denoising_model(self, x_t, time, global_cond, embodiment_name):
    """Run the denoiser, wrapping it in the embodiment's codec when present.

    Args:
        x_t: noisy action tensor for the current step.
        time: flow-matching time passed straight to ``self.model``.
        global_cond: conditioning passed straight to ``self.model``.
        embodiment_name: key into ``self.embodiment_specs`` selecting which
            encoder/decoder pair to use.

    Returns:
        The output of ``self.model``, passed through the embodiment's decoder
        if it has one.

    The encoder and decoder are looked up per embodiment and each is applied
    only if it is actually set. ``codec_enabled`` is a policy-wide flag, so an
    embodiment without a codec can coexist with one that has it; such an
    embodiment is fed to ``self.model`` unchanged instead of failing on a
    missing (``None``) encoder.
    """
    encoder = decoder = None
    if self.codec_enabled:
        spec = self.embodiment_specs[embodiment_name]
        # .get mirrors how the specs are read when the codecs are registered.
        encoder = spec.get("encoder")
        decoder = spec.get("decoder")

    if encoder is not None:
        x_t = encoder(x_t)
    v_t = self.model(x_t, time, global_cond)
    if decoder is not None:
        v_t = decoder(v_t)
    return v_t
@classmethod
def viz_keypoints_gt_preds(cls, predictions, batch, image_key, action_key, **kwargs):
    """Overlay ground-truth and predicted keypoints on every image in a batch.

    The embodiment name is resolved from the first element of
    ``batch["embodiment"]`` and used to pick the prediction entry
    ``f"{embodiment_name}_{action_key}"``. For each sample the ground-truth
    actions are drawn with the "Reds" palette and the predictions are drawn on
    top of that result with the "Greens" palette; extra keyword arguments are
    forwarded to ``cls.viz``.

    Returns:
        The per-sample results of ``cls.viz`` stacked along a new first axis.
    """
    embodiment_name = get_embodiment(batch["embodiment"][0].item()).lower()

    frames = _to_numpy(batch[image_key])
    gt_actions = _to_numpy(batch[action_key])
    pred_actions = _to_numpy(predictions[f"{embodiment_name}_{action_key}"])

    rendered = []
    # Indexed by the image count so that a shorter action array still fails
    # loudly instead of being silently truncated.
    for idx in range(frames.shape[0]):
        canvas = cls.viz(
            frames[idx], gt_actions[idx], mode="keypoints", color="Reds", **kwargs
        )
        canvas = cls.viz(
            canvas, pred_actions[idx], mode="keypoints", color="Greens", **kwargs
        )
        rendered.append(canvas)
    return np.stack(rendered, axis=0)
intrinsics_key=intrinsics_key + images=images, + actions=actions, + mode=mode, + intrinsics_key=intrinsics_key, + **kwargs, ) @classmethod @@ -87,13 +119,25 @@ def viz( **kwargs, ) if mode == "keypoints": + color = kwargs.get("color", None) + if color is not None and ColorPalette.is_valid(color): + n = len(cls.FINGER_COLORS) + colors = { + finger: ColorPalette.to_rgb(color, value=(i + 1) / (n + 1)) + for i, finger in enumerate(cls.FINGER_COLORS) + } + dot_color = ColorPalette.to_rgb(color, value=0.7) + else: + colors = cls.FINGER_COLORS + dot_color = cls.DOT_COLOR return _viz_keypoints( images=images, actions=actions, intrinsics_key=intrinsics_key, edges=cls.FINGER_EDGES, - colors=cls.FINGER_COLORS, edge_ranges=cls.FINGER_EDGE_RANGES, + colors=colors, + dot_color=dot_color, **kwargs, ) raise ValueError( @@ -227,6 +271,7 @@ class Aria(Human): ("ring", 9, 12), ("pinky", 12, 15), ] + DOT_COLOR = (255, 165, 0) class Scale(Human): diff --git a/egomimic/trainHydra.py b/egomimic/trainHydra.py index 7f2644c5..431d2f7d 100644 --- a/egomimic/trainHydra.py +++ b/egomimic/trainHydra.py @@ -1,6 +1,7 @@ import copy import os import signal +import subprocess from collections.abc import Mapping from typing import Any, Dict, List, Optional, Tuple @@ -114,7 +115,7 @@ def train(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]: data_schematic.infer_norm_from_dataset( norm_dataset, dataset_name, - sample_frac=0.005, + sample_frac=0.0001, benchmark_dir=os.path.join( cfg.trainer.default_root_dir, "benchmark_stats.json" ), @@ -216,6 +217,9 @@ def main(cfg: DictConfig) -> Optional[float]: :param cfg: DictConfig configuration composed by Hydra. :return: Optional[float] with optimized metric value. """ + script = os.path.join(os.path.dirname(__file__), "utils/aws/setup_secret.sh") + subprocess.run(["bash", script], check=True) + # apply extra utilities # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.) 
extras(cfg) diff --git a/egomimic/train_zarr.yaml b/egomimic/train_zarr.yaml new file mode 100644 index 00000000..26f14b3b --- /dev/null +++ b/egomimic/train_zarr.yaml @@ -0,0 +1,111 @@ +defaults: + - model: hpt_cotrain_flow_shared_head_latent + - visualization: eva_cartesian_aria_cartesian + - paths: default + - trainer: ddp + - debug: null + - logger: wandb + - data: eva_human_keypoints_cotrain + - callbacks: checkpoints + - override hydra/launcher: submitit + - _self_ + +name: latent_flow +description: latent_flow +ckpt_path: null +train: true +eval: false + +eval_class: + _target_: egomimic.scripts.evaluation.Eve + mode: real + arm: both + eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" + +hydra: + run: + # Dir should be experiment_name/description_{timestamp} + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + sweep: + dir: ./logs/${name}/${description}_${now:%Y-%m-%d_%H-%M-%S} + +launch_params: + gpus_per_node: 1 + nodes: 1 + +data_schematic: # Dynamically fill in these shapes from the dataset + _target_: egomimic.rldb.zarr.utils.DataSchematic + norm_mode: quantile + schematic_dict: + eva_bimanual: + front_img_1: #batch key + key_type: camera_keys # key type + zarr_key: observations.images.front_img_1 # dataset key + right_wrist_img: + key_type: camera_keys + zarr_key: observations.images.right_wrist_img + left_wrist_img: + key_type: camera_keys + zarr_key: observations.images.left_wrist_img + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose + joint_positions: + key_type: proprio_keys + zarr_key: observations.state.joint_positions + actions_joints: + key_type: action_keys + zarr_key: actions_joints + actions_cartesian: + key_type: action_keys + zarr_key: actions_cartesian + embodiment: + key_type: metadata_keys + zarr_key: metadata.embodiment + aria_bimanual: + front_img_1: + key_type: camera_keys + zarr_key: observations.images.front_img_1 + ee_pose: + key_type: proprio_keys + zarr_key: observations.state.ee_pose 
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      actions_keypoints:
+        key_type: action_keys
+        zarr_key: actions_keypoints
+      keypoint_positions:
+        key_type: proprio_keys
+        zarr_key: observations.state.keypoints
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    mecka_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+    scale_bimanual:
+      front_img_1:
+        key_type: camera_keys
+        zarr_key: observations.images.front_img_1
+      ee_pose:
+        key_type: proprio_keys
+        zarr_key: observations.state.ee_pose
+      actions_cartesian:
+        key_type: action_keys
+        zarr_key: actions_cartesian
+      embodiment:
+        key_type: metadata_keys
+        zarr_key: metadata.embodiment
+
+seed: 42
diff --git a/egomimic/utils/viz_utils.py b/egomimic/utils/viz_utils.py
index bff862bd..d4289a74 100644
--- a/egomimic/utils/viz_utils.py
+++ b/egomimic/utils/viz_utils.py
@@ -1,4 +1,5 @@
 import cv2
+import matplotlib.pyplot as plt
 import numpy as np
 from scipy.spatial.transform import Rotation as R
 
@@ -22,6 +23,13 @@ class ColorPalette:
     def is_valid(cls, name: str) -> bool:
         return name in vars(cls).values()
 
+    @classmethod
+    def to_rgb(cls, cmap_name: str, value: float = 0.7) -> tuple[int, int, int]:
+        """Convert a ColorPalette cmap name to an RGB tuple (0-255).
+        value: 0-1, where higher = darker shade."""
+        rgba = plt.get_cmap(cmap_name)(value)
+        return tuple(int(c * 255) for c in rgba[:3])
+
 
 def _prepare_viz_image(img):
     if img.ndim == 3 and img.shape[0] in (1, 3):
@@ -161,7 +169,7 @@ def _draw_rotation_at_anchor(
 
 
 def _viz_keypoints(
-    images, actions, intrinsics_key, edges, colors, edge_ranges, **kwargs
+    images, actions, intrinsics_key, edges, edge_ranges, colors, dot_color, **kwargs
 ):
     """Visualize all 21 MANO keypoints per hand, projected onto the image."""
     # Prepare image
@@ -178,7 +186,7 @@ def _viz_keypoints(
     keypoints = {}
     keypoints["left"] = left_keypoints.reshape(-1, 3)
     keypoints["right"] = right_keypoints.reshape(-1, 3)
-    for hand, dot_color in [("left", (0, 120, 255)), ("right", (255, 80, 0))]:
+    for hand, dot_color in [("left", dot_color), ("right", dot_color)]:
         kps_cam = keypoints[hand]
         # Camera frame -> pixels
         kps_px = cam_frame_to_cam_pixels(kps_cam, intrinsics)  # (42, 3+) 21 per arm