diff --git a/.gitignore b/.gitignore index 315c803a..98ed51ac 100644 --- a/.gitignore +++ b/.gitignore @@ -33,4 +33,6 @@ lerobot_test/ **/lerobot_test/ **/lerobot_test/** **/robot/models/** -**/robot/models/ \ No newline at end of file +**/robot/models/ +*parquet* +*.zarr* \ No newline at end of file diff --git a/egomimic/hydra_configs/data/clothe_eva_aria_mecka.yaml b/egomimic/hydra_configs/data/clothe_eva_aria_mecka.yaml new file mode 100644 index 00000000..c250ad41 --- /dev/null +++ b/egomimic/hydra_configs/data/clothe_eva_aria_mecka.yaml @@ -0,0 +1,70 @@ +train_datasets: + dataset1: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: eva_bimanual + filters: + task: fold clothes + local_files_only: true + indomain: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold_clothes_indomain + local_files_only: true + everse_rl2: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: rl2 + operator: rl2 + local_files_only: true + everse_song: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: song + local_files_only: true + everse_wang: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: wang + local_files_only: true + everse_eth: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: eth + local_files_only: true + mecka_flagship: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: mecka_bimanual + filters: + task: fold_clothes + local_files_only: true + mecka_freeform: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: 
total + embodiment: mecka_bimanual + filters: + task: folding_clothes + local_files_only: true diff --git a/egomimic/hydra_configs/data/mecka_test.yaml b/egomimic/hydra_configs/data/mecka_test.yaml index 7f190339..f7cf3494 100644 --- a/egomimic/hydra_configs/data/mecka_test.yaml +++ b/egomimic/hydra_configs/data/mecka_test.yaml @@ -3,19 +3,19 @@ _target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper train_datasets: dataset1: _target_: rldb.utils.RLDBDataset - repo_id: "mecka_test" + repo_id: "aria_bimanual" mode: train - embodiment: "mecka_bimanual" - root: "/coc/flash7/acheluva3/EgoVerse/mecka_demo" + embodiment: "aria_bimanual" + root: "/nethome/paphiwetsa3/flash/projects/EgoVerse/datasets" local_files_only: True valid_datasets: dataset1: _target_: rldb.utils.RLDBDataset - repo_id: "mecka_test" + repo_id: "aria_bimanual" mode: valid - embodiment: "mecka_bimanual" - root: "/coc/flash7/acheluva3/EgoVerse/mecka_demo" + embodiment: "aria_bimanual" + root: "/nethome/paphiwetsa3/flash/projects/EgoVerse/datasets" local_files_only: True train_dataloader_params: @@ -26,4 +26,4 @@ train_dataloader_params: valid_dataloader_params: dataset1: batch_size: 32 - num_workers: 10 \ No newline at end of file + num_workers: 10 diff --git a/egomimic/hydra_configs/data/test_bimanual.yaml b/egomimic/hydra_configs/data/test_bimanual.yaml index 7ef496ba..39ce0a0e 100644 --- a/egomimic/hydra_configs/data/test_bimanual.yaml +++ b/egomimic/hydra_configs/data/test_bimanual.yaml @@ -5,7 +5,7 @@ train_datasets: datasets: rl2_lab: _target_: rldb.utils.FolderRLDBDataset - folder_path: /coc/cedarp-dxu345-0/datasets/egoverse/put_cup_on_saucer_egoverse/put_cup_on_saucer_rl2 + folder_path: /nethome/paphiwetsa3/flash/projects/EgoVerse/datasets embodiment: aria_bimanual mode: train local_files_only: true @@ -17,8 +17,7 @@ valid_datasets: datasets: eth_lab: _target_: rldb.utils.FolderRLDBDataset - folder_path: 
/coc/cedarp-dxu345-0/datasets/egoverse/put_cup_on_saucer_egoverse/put_cup_on_saucer_song - embodiment: aria_bimanual + folder_path: /nethome/paphiwetsa3/flash/projects/EgoVerse/datasets mode: valid local_files_only: true embodiment: "aria_bimanual" @@ -31,4 +30,4 @@ train_dataloader_params: valid_dataloader_params: dataset1: batch_size: 2 - num_workers: 10 \ No newline at end of file + num_workers: 10 diff --git a/egomimic/hydra_configs/data/viz_data.yaml b/egomimic/hydra_configs/data/viz_data.yaml new file mode 100644 index 00000000..243932df --- /dev/null +++ b/egomimic/hydra_configs/data/viz_data.yaml @@ -0,0 +1,64 @@ +_target_: egomimic.pl_utils.pl_data_utils.MultiDataModuleWrapper +train_datasets: + dataset1: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: eva_bimanual + filters: + task: fold_clothes + local_files_only: true + + dataset2: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold_clothes + local_files_only: true + + # dataset3: + # _target_: egomimic.rldb.utils.S3RLDBDataset + # bucket_name: rldb + # mode: total + # valid_ratio: 0.5 + # embodiment: mecka_bimanual + # cache_root: "/coc/flash7/rpunamiya6/.cache" + # filters: + # task: fold_clothes + # local_files_only: true + +valid_datasets: + dataset1: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: eva_bimanual + filters: + task: fold clothes + local_files_only: true + dataset2: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold_clothes_indomain + local_files_only: true + +train_dataloader_params: + dataset1: + batch_size: 256 + num_workers: 10 + dataset2: + batch_size: 256 + num_workers: 10 + +valid_dataloader_params: + dataset1: + batch_size: 256 + num_workers: 10 + dataset2: + batch_size: 256 + num_workers: 10 diff --git 
a/egomimic/hydra_configs/data/viz_data2.yaml b/egomimic/hydra_configs/data/viz_data2.yaml new file mode 100644 index 00000000..2a89c3cf --- /dev/null +++ b/egomimic/hydra_configs/data/viz_data2.yaml @@ -0,0 +1,54 @@ +train_datasets: + dataset1: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: eva_bimanual + filters: + task: fold clothes + local_files_only: true + indomain: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold_clothes_indomain + local_files_only: true + everse_rl2: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: rl2 + operator: rl2 + local_files_only: true + everse_song: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: song + local_files_only: true + everse_wang: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: wang + local_files_only: true + everse_eth: + _target_: egomimic.rldb.utils.S3RLDBDataset + bucket_name: rldb + mode: total + embodiment: aria_bimanual + filters: + task: fold clothes + lab: eth + local_files_only: true diff --git a/egomimic/hydra_configs/train.yaml b/egomimic/hydra_configs/train.yaml index 828a21b6..c1299d34 100644 --- a/egomimic/hydra_configs/train.yaml +++ b/egomimic/hydra_configs/train.yaml @@ -1,10 +1,10 @@ defaults: - - model: hpt_bc_flow_eva + - model: hpt_bc_flow_aria - paths: default - trainer: ddp - debug: null - logger: wandb - - data: eva_bc_s3 + - data: mecka_test - callbacks: checkpoints - override hydra/launcher: submitit - _self_ @@ -16,7 +16,7 @@ train: true eval: false eval_class: - _target_ : egomimic.scripts.evaluation.Eve + _target_: egomimic.scripts.evaluation.Eve mode: real arm: both 
eval_path: "./logs/eval/${name}_${now:%Y-%m-%d_%H-%M-%S}" @@ -93,10 +93,7 @@ data_schematic: # Dynamically fill in these shapes from the dataset embodiment: key_type: metadata_keys lerobot_key: metadata.embodiment - viz_img_key: - eva_bimanual: - front_img_1 - aria_bimanual: - front_img_1 - mecka_bimanual: - front_img_1 + viz_img_key: + eva_bimanual: front_img_1 + aria_bimanual: front_img_1 + mecka_bimanual: front_img_1 diff --git a/egomimic/models/hpt_nets.py b/egomimic/models/hpt_nets.py index 7214c29c..26765cb5 100644 --- a/egomimic/models/hpt_nets.py +++ b/egomimic/models/hpt_nets.py @@ -34,6 +34,9 @@ from functools import partial +from transformers import CLIPVisionModel +from transformers import AutoImageProcessor, AutoModel + from transformers import T5Tokenizer, T5Model, AutoTokenizer from transformers import CLIPTextModel, CLIPVisionModel # TODO: add CLIP @@ -606,6 +609,97 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return y +class DinoV3(PolicyStem): + def __init__( + self, + output_dim, + model_type: str = "facebook/dinov3-vits16plus-pretrain-lvd1689m", + freeze_backbone: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.model = AutoModel.from_pretrained( + model_type + ) + self.freeze_backbone = freeze_backbone + + if "conv" in model_type: + self.proj = nn.Linear(self.model.config.hidden_sizes[-1], output_dim) + else: + self.proj = nn.Linear(self.model.config.hidden_size, output_dim) + + if self.freeze_backbone: + for p in self.model.parameters(): + p.requires_grad = False + + self.model.eval() + else: + # fix for ViT DinoV3 to prevent unused params error + for name, p in self.model.named_parameters(): + if "mask_token" in name: + p.requires_grad = False + self.model.train() + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs a forward pass of the model. + Args: + x: Image tensor with shape [B, T, N, 3, H, W] representing the batch size, + horizon, instance (e.g. 
num of views) + Returns: + Flatten tensor with shape [B, M, 512] + """ + B, T, N, C, H, W = x.shape + x = x.view(B * T * N, C, H, W) + + outputs = self.model(pixel_values=x) + outputs = outputs.last_hidden_state + outputs = self.proj(outputs) + + return outputs + +class CLIP(PolicyStem): + def __init__( + self, + output_dim, + model_type: str = "openai/clip-vit-base-patch32", + freeze_backbone: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + + self.model = CLIPVisionModel.from_pretrained(model_type) + self.freeze_backbone = freeze_backbone + self.proj = nn.Linear(self.model.config.hidden_size, output_dim) + + if self.freeze_backbone: + for p in self.model.parameters(): + p.requires_grad = False + + self.model.eval() + + else: + self.model.train() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs a forward pass of the model. + Args: + x: Image tensor with shape [B, T, N, 3, H, W] representing the batch size, + horizon, instance (e.g. num of views) + Returns: + Flatten tensor with shape [B, M, 512] + """ + B, T, N, C, H, W = x.shape + x = x.view(B * T * N, C, H, W) + + outputs = self.model(pixel_values=x) + outputs = outputs.pooler_output + outputs = self.proj(outputs) + outputs = outputs.view(B, T*N, -1) + + return outputs + + class ResNet(PolicyStem): def __init__( self, diff --git a/egomimic/scripts/data_visualisation.ipynb b/egomimic/scripts/data_visualisation.ipynb index 8eec78ef..479f4a71 100644 --- a/egomimic/scripts/data_visualisation.ipynb +++ b/egomimic/scripts/data_visualisation.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/nethome/paphiwetsa3/flash/projects/EgoVerse/.venv/lib/python3.11/site-packages/torch/cuda/__init__.py:61: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. 
If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.\n", + " import pynvml # type: ignore[import]\n" + ] + } + ], "source": [ "# IMPORTS\n", "from egomimic.rldb.utils import *\n", @@ -17,12 +26,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8b62fc7671f24672b715b88fadf6e295", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Generating train split: 0 examples [00:00, ? examples/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Load dataset\n", - "root = \"/coc/flash7/paphiwetsa3/datasets/eva_test_data2/proc2/lerobot_test\"\n", + "root = \"/nethome/paphiwetsa3/flash/projects/EgoVerse/datasets\"\n", "repo_id = \"rpuns/aria_laundry_rl2\"\n", "\n", "episodes = [0, 1]\n", @@ -33,9 +57,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'observations.state.ee_pose': {'dtype': 'float64', 'shape': (12,), 'names': ['dim_0']}, 'observations.images.front_img_1': {'dtype': 'image', 'shape': (480, 640, 3), 'names': ['channel', 'height', 'width']}, 'actions_cartesian': {'dtype': 'prestacked_float64', 'shape': (100, 12), 'names': ['chunk_length', 'action_dim']}, 'metadata.embodiment': {'dtype': 'int32', 'shape': (1,), 'names': ['dim_0']}, 'timestamp': {'dtype': 'float32', 'shape': (1,), 'names': None}, 'frame_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'episode_index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'index': {'dtype': 'int64', 'shape': (1,), 'names': None}, 'task_index': {'dtype': 'int64', 'shape': (1,), 'names': None}}\n" + ] + } + ], "source": [ "# Get metadata\n", "print(dataset.meta.info[\"features\"])\n", @@ -46,26 
+78,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n" + ] + } + ], "source": [ "print(dataset.embodiment)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Make data_loader\n", - "data_loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)" + "data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -74,12 +114,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def visualize_actions(ims, actions, extrinsics, intrinsics, arm=\"both\"):\n", - " for b in range(ims.shape[0]):\n", + " for b in range(actions.shape[0]):\n", " if actions.shape[-1] == 7 or actions.shape[-1] == 14:\n", " ac_type = \"joints\"\n", " elif actions.shape[-1] == 3 or actions.shape[-1] == 6:\n", @@ -96,33 +136,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 100, 6)\n", + "Saved batch 0 images to ./visualization/\n", + "(1, 100, 6)\n", + "Saved batch 1 images to ./visualization/\n", + "(1, 100, 6)\n", + "Saved batch 2 images to ./visualization/\n", + "(1, 100, 6)\n", + "Saved batch 3 images to ./visualization/\n", + "(1, 100, 6)\n", + "Saved batch 4 images to ./visualization/\n", + "(1, 100, 6)\n", + "Saved batch 5 images to ./visualization/\n", + "(1, 100, 6)\n", + "Saved batch 6 images to ./visualization/\n" + ] + } + ], "source": [ "save_dir = \"./visualization/\"\n", "os.makedirs(save_dir, exist_ok=True)\n", "\n", - "num_batches = 1\n", + "num_batches = 6\n", "\n", "for i, data 
in enumerate(data_loader):\n", " if i > num_batches:\n", " break\n", " ims = (data[image_key].permute(0, 2, 3, 1).cpu().numpy() * 255.0).astype(np.uint8)\n", " actions = data[actions_key].cpu().numpy()\n", - " # print(actions_key)\n", - " print(actions[:10, :])\n", - "\n", - " ims_viz = visualize_actions(ims, actions[:, :3], camera_transforms.extrinsics, camera_transforms.intrinsics)\n", + " actions = actions[:1, ...]\n", + " ims = ims[:1, ...]\n", + " left_actions = actions[..., :3]\n", + " right_actions = actions[..., 7:10]\n", + " both_actions = np.concatenate([left_actions, right_actions], axis=-1)\n", + " print(both_actions.shape)\n", + " ims_viz = visualize_actions(ims, both_actions, camera_transforms.extrinsics, camera_transforms.intrinsics)\n", "\n", " for j, im in enumerate(ims_viz):\n", " img_tensor = torch.from_numpy(im).permute(2, 0, 1)\n", " save_path = os.path.join(save_dir, f\"image_{i}_{j}.png\")\n", " io.write_png(img_tensor, save_path)\n", "\n", - " print(f\"Saved batch {i} images to {save_dir}\")\n", - " break" + " print(f\"Saved batch {i} images to {save_dir}\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -141,9 +211,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.16" + "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/egomimic/scripts/visualization_process/debug/check_zar.py b/egomimic/scripts/visualization_process/debug/check_zar.py new file mode 100644 index 00000000..dd044654 --- /dev/null +++ b/egomimic/scripts/visualization_process/debug/check_zar.py @@ -0,0 +1,83 @@ +import json +from pathlib import Path + +import pandas as pd +import zarr + + +def main(): + # Default to the outputs produced by `process_image.py` + data_dir = Path("egomimic/scripts/visualization_process/fold_clothes_aria_eva") + manifest_path = data_dir / "manifest.json" 
+ manifest = json.loads(manifest_path.read_text()) + + print("[INFO] manifest_path =", manifest_path) + print("[INFO] n_frames =", manifest["n_frames"]) + print("[INFO] embedding_dim =", manifest["embedding_dim"]) + print("[INFO] embed_store =", manifest["embed_store"]) + + # Load metadata parquet + meta_path = Path(manifest["metadata_parquet"]) + meta_df = pd.read_parquet(meta_path) + print("[INFO] metadata rows =", len(meta_df)) + print("[INFO] metadata cols =", len(meta_df.columns)) + # basic columns we expect + for col in ("global_index", "episode_hash"): + print("[INFO] has {} = {}".format(col, col in meta_df.columns)) + if len(meta_df) > 0: + first_row = meta_df.iloc[100].to_dict() + breakpoint() + print("[INFO] metadata[0] keys =", sorted(list(first_row.keys()))[:40], "...") + print("[INFO] metadata[0] =", first_row) + + # Load embeddings zarr for first image key + first_key = manifest["image_keys"][0] + zarr_path = Path(manifest["embeddings"][first_key]) + root = zarr.open_group(str(zarr_path), mode="r") + arr = root["embeddings"] + print("[INFO] zarr_path =", zarr_path) + print("[INFO] zarr array =", "embeddings") + print("[INFO] shape/dtype =", arr.shape, arr.dtype, "chunks=", arr.chunks) + + # Sanity: embeddings rows should match metadata rows for 1:1 alignment + if arr.shape[0] != len(meta_df): + raise RuntimeError( + "Row mismatch: embeddings has {} rows but metadata has {} rows".format( + arr.shape[0], len(meta_df) + ) + ) + + # Explicitly access a latent (embedding) row. + # This is the vector aligned with metadata row 0. 
+ x0 = arr[0, :] # (D,) + x_last = arr[arr.shape[0] - 1, :] + print("[INFO] first latent shape =", getattr(x0, "shape", None), "dtype=", getattr(x0, "dtype", None)) + print("[INFO] last latent shape =", getattr(x_last, "shape", None), "dtype=", getattr(x_last, "dtype", None)) + # Print only a small slice to keep logs readable + try: + x0_slice = x0[:16] + print("[INFO] latent[0][:16] =", x0_slice) + # quick stats + x0_f = x0.astype("float32", copy=False) + print( + "[INFO] latent[0] stats min/max/mean =", + float(x0_f.min()), + float(x0_f.max()), + float(x0_f.mean()), + ) + + y = root["tsne_2d"][:10] # (10, 2) + print("tsne_2d[:10] =\n", y) + print("min_xy =", y.min(axis=0), "max_xy =", y.max(axis=0), "mean_xy =", y.mean(axis=0)) + except Exception as e: + print("[WARN] Could not slice/stats latent[0]:", e) + + # Check global_index alignment (expected: 0..n-1 in this one-batch run) + if "global_index" in meta_df.columns: + gi_min = int(meta_df["global_index"].min()) + gi_max = int(meta_df["global_index"].max()) + print("[INFO] global_index min/max =", gi_min, gi_max) + + +if __name__ == "__main__": + main() diff --git a/egomimic/scripts/visualization_process/debug/test_dinov3.py b/egomimic/scripts/visualization_process/debug/test_dinov3.py new file mode 100644 index 00000000..7c80ed9e --- /dev/null +++ b/egomimic/scripts/visualization_process/debug/test_dinov3.py @@ -0,0 +1,37 @@ +""" +Quick smoke test for DinoV3 from `egomimic/models/hpt_nets.py`. + +Runs a random forward pass and prints output shapes + +Checking if DinoV3 is working as expected. 
+""" + +import torch + +from egomimic.models.hpt_nets import DinoV3 + + +def main(): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # `DinoV3.forward` expects images shaped [B, T, N, 3, H, W] + B, T, N, C, H, W = 2, 3, 1, 3, 224, 224 + x = torch.randn(B, T, N, C, H, W, device=device) + + model = DinoV3( + output_dim=256, + # default is "facebook/dinov3-vits16plus-pretrain-lvd1689m" + freeze_backbone=True, + ).to(device) + + model.eval() + with torch.no_grad(): + y = model(x) + + print("input: {}".format(tuple(x.shape))) + # typically [(B*T*N), tokens, output_dim] + print("output: {}".format(tuple(y.shape))) + + +if __name__ == "__main__": + main() diff --git a/egomimic/scripts/visualization_process/dim_reduce.py b/egomimic/scripts/visualization_process/dim_reduce.py new file mode 100644 index 00000000..85f23f90 --- /dev/null +++ b/egomimic/scripts/visualization_process/dim_reduce.py @@ -0,0 +1,244 @@ +""" +Run dimensionality reduction on saved embedding latents and store 2D coords back into the zarr. + +Reads: +- manifest.json (to find the embeddings zarr path) +- embeddings zarr group (expects dataset name "embeddings") + +Writes: +- dataset "_2d" into the same zarr group by default (tsne_2d/umap_2d/pca_2d), + shape (N, 2), float32 +""" + +import argparse +import json +from pathlib import Path + +import numpy as np +import zarr + + +def _load_embeddings(zarr_path: Path) -> np.ndarray: + root = zarr.open_group(str(zarr_path), mode="r") + if "embeddings" not in root: + raise KeyError( + "Expected dataset 'embeddings' in zarr group. 
Found keys: {}".format( + list(root.array_keys()) + ) + ) + arr = root["embeddings"] + # Load entire array into memory for t-SNE + x = arr[:] + print("x.shape =", x.shape) + # cuML prefers float32 + if x.dtype != np.float32: + x = x.astype(np.float32, copy=False) + return x + + +def _to_numpy(x): + # Convert cupy -> numpy if needed + try: + import cupy as cp + + if isinstance(x, cp.ndarray): + return cp.asnumpy(x) + except Exception: + pass + return np.asarray(x) + + +def _run_cuml_tsne( + x: np.ndarray, *, perplexity: float, random_state: int, learning_rate: float +) -> np.ndarray: + try: + from cuml import TSNE + except Exception as e: + raise RuntimeError( + "cuml is required. Make sure RAPIDS/cuML is installed in this environment." + ) from e + + # cuML TSNE returns a (N, 2) array-like (often cupy-backed); convert to numpy. + tsne = TSNE( + n_components=2, + perplexity=perplexity, + random_state=random_state, + init="random", + # NOTE: scikit-learn supports learning_rate="auto", but cuML expects numeric. + learning_rate=float(learning_rate), + ) + y = tsne.fit_transform(x) + + y = _to_numpy(y) + if y.ndim != 2 or y.shape[1] != 2: + raise RuntimeError("Unexpected TSNE output shape: {}".format(y.shape)) + return y.astype(np.float32, copy=False) + + +def _run_cuml_umap( + x: np.ndarray, *, n_neighbors: int, min_dist: float, metric: str, random_state: int +) -> np.ndarray: + try: + from cuml import UMAP + except Exception as e: + raise RuntimeError( + "cuml is required for UMAP. Make sure RAPIDS/cuML is installed in this environment." 
+ ) from e + + umap = UMAP( + n_components=2, + n_neighbors=int(n_neighbors), + min_dist=float(min_dist), + metric=str(metric), + random_state=int(random_state), + ) + y = umap.fit_transform(x) + y = _to_numpy(y) + if y.ndim != 2 or y.shape[1] != 2: + raise RuntimeError("Unexpected UMAP output shape: {}".format(y.shape)) + return y.astype(np.float32, copy=False) + + +def _run_pca(x: np.ndarray, *, n_components: int, random_state: int) -> np.ndarray: + # Prefer GPU PCA if available; otherwise fall back to sklearn. + try: + from cuml import PCA # type: ignore + + pca = PCA(n_components=int(n_components), random_state=int(random_state)) + y = pca.fit_transform(x) + y = _to_numpy(y) + except Exception: + try: + from sklearn.decomposition import PCA # type: ignore + except Exception as e: + raise RuntimeError( + "PCA requires either cuML (preferred) or scikit-learn installed." + ) from e + + pca = PCA(n_components=int(n_components), random_state=int(random_state)) + y = pca.fit_transform(x) + y = np.asarray(y) + + if y.ndim != 2 or y.shape[1] != int(n_components): + raise RuntimeError("Unexpected PCA output shape: {}".format(y.shape)) + if y.shape[1] != 2: + raise RuntimeError("This script only supports 2D outputs; got PCA dim {}".format(y.shape[1])) + return y.astype(np.float32, copy=False) + + +def _write_2d(zarr_path: Path, *, y2d: np.ndarray, name: str, overwrite: bool) -> None: + root = zarr.open_group(str(zarr_path), mode="a") + + if name in root and not overwrite: + raise FileExistsError( + "Zarr dataset '{}' already exists at {}. 
Use --overwrite to replace.".format( + name, zarr_path + ) + ) + + chunks = (min(8192, y2d.shape[0]), 2) + root.create_dataset( + name, + shape=y2d.shape, + chunks=chunks, + dtype=np.float32, + overwrite=overwrite, + ) + root[name][:] = y2d + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument( + "--manifest", + type=str, + default="egomimic/scripts/visualization_process/fold_clothes_aria_eva_all_labs/manifest.json", + help="Path to manifest.json produced by process_image.py", + ) + ap.add_argument( + "--image-key", + type=str, + default="", + help="Optional image key to select from manifest['embeddings'] (defaults to first).", + ) + ap.add_argument( + "--method", + type=str, + default="tsne", + choices=("tsne", "umap", "pca"), + help="Dimensionality reduction method.", + ) + ap.add_argument( + "--out-name", + type=str, + default="", + help="Dataset name to write in zarr. Defaults to _2d.", + ) + # TSNE args + ap.add_argument("--perplexity", type=float, default=30.0, help="TSNE perplexity (tsne only).") + ap.add_argument( + "--learning-rate", + type=float, + default=200.0, + help="cuML TSNE learning rate (tsne only; must be numeric).", + ) + # UMAP args + ap.add_argument("--n-neighbors", type=int, default=15, help="UMAP n_neighbors (umap only).") + ap.add_argument("--min-dist", type=float, default=0.1, help="UMAP min_dist (umap only).") + ap.add_argument("--metric", type=str, default="euclidean", help="UMAP metric (umap only).") + # PCA args + ap.add_argument("--pca-components", type=int, default=2, help="PCA n_components (pca only).") + ap.add_argument("--seed", type=int, default=0) + ap.add_argument("--overwrite", action="store_true") + args = ap.parse_args() + + manifest_path = Path(args.manifest) + manifest = json.loads(manifest_path.read_text()) + + if manifest.get("embed_store") != "zarr": + raise RuntimeError("This script expects manifest embed_store == 'zarr'.") + + if args.image_key: + image_key = args.image_key + else: + image_key = 
manifest["image_keys"][0] + + zarr_path = Path(manifest["embeddings"][image_key]) + print("[INFO] zarr_path =", zarr_path) + print("[INFO] reading embeddings for key =", image_key) + + x = _load_embeddings(zarr_path) + print("[INFO] embeddings shape/dtype =", x.shape, x.dtype) + + if args.out_name: + out_name = args.out_name + else: + out_name = f"{args.method}_2d" + + if args.method == "tsne": + y2d = _run_cuml_tsne( + x, perplexity=args.perplexity, random_state=args.seed, learning_rate=args.learning_rate + ) + elif args.method == "umap": + y2d = _run_cuml_umap( + x, + n_neighbors=args.n_neighbors, + min_dist=args.min_dist, + metric=args.metric, + random_state=args.seed, + ) + elif args.method == "pca": + if int(args.pca_components) != 2: + raise ValueError("--pca-components must be 2 for this script (got {})".format(args.pca_components)) + y2d = _run_pca(x, n_components=args.pca_components, random_state=args.seed) + else: + raise RuntimeError("Unsupported method: {}".format(args.method)) + + print("[INFO] {} shape/dtype =".format(out_name), y2d.shape, y2d.dtype) + + _write_2d(zarr_path, y2d=y2d, name=out_name, overwrite=args.overwrite) + print("[DONE] wrote {} into {}".format(out_name, zarr_path)) + + +if __name__ == "__main__": + main() diff --git a/egomimic/scripts/visualization_process/process_image.py b/egomimic/scripts/visualization_process/process_image.py new file mode 100644 index 00000000..2e0e313d --- /dev/null +++ b/egomimic/scripts/visualization_process/process_image.py @@ -0,0 +1,531 @@ +""" +create_dino.py +============== + +Downloads RLDB datasets via `S3RLDBDataset`, writes a metadata table (Parquet), +and computes image embeddings using a DINO-family model (defaults to DINOv2 via torch.hub). + +Outputs (by default) to: +- metadata parquet: /metadata.parquet +- embeddings: /embeddings.zarr (or .npy memmap) + +Notes +----- +- "DINOv3" is not guaranteed to be available via torch.hub. 
This script will try to + load the requested hub repo, and falls back to DINOv2 if that fails. +- The RLDB datasets are HuggingFace / LeRobot datasets. Instantiating S3RLDBDataset + will sync needed episode folders locally. +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any, Dict, List + +import hydra +import numpy as np +from omegaconf import OmegaConf +import pandas as pd +import torch +from torch.utils.data import DataLoader, Subset + +from egomimic.rldb.utils import S3RLDBDataset, MultiRLDBDataset +from egomimic.algo.hpt import DinoV3 +from egomimic.utils.aws.aws_sql import create_default_engine, episode_table_to_df + + +def _parse_json_or_empty(s: str) -> dict: + if not s: + return {} + return json.loads(s) + + +def _safe_get(d: dict, k: str, default=None): + try: + return d.get(k, default) + except Exception: + return default + + +def _image_to_torch_uint8_bchw(x) -> torch.Tensor: + """ + Accept common dataset image formats: + - numpy uint8: HWC or CHW + - torch uint8/float: HWC/CHW/BCHW/BHWC + Returns uint8 BCHW. 
+ """ + if isinstance(x, torch.Tensor): + t = x + else: + t = torch.from_numpy(np.asarray(x)) + + if t.ndim == 3: + # HWC or CHW -> add batch + t = t.unsqueeze(0) + if t.ndim != 4: + raise ValueError(f"Expected 3D/4D image tensor/array, got shape={tuple(t.shape)}") + + # If last dim looks like channels -> BHWC -> BCHW + if t.shape[-1] in (1, 3) and t.shape[1] not in (1, 3): + t = t.permute(0, 3, 1, 2).contiguous() + # Else assume already BCHW (or ambiguous) + + if t.dtype != torch.uint8: + # If floats in [0,1] or [-1,1], bring to uint8 best-effort + if t.is_floating_point(): + t = t.to(torch.float32) + t = torch.clamp(t, 0.0, 1.0) if t.max() <= 1.0 else torch.clamp(t, -1.0, 1.0) * 0.5 + 0.5 + t = torch.round(t * 255.0).to(torch.uint8) + else: + t = t.to(torch.uint8) + return t + + +def _bchw_u8_to_list_hwc_u8(img_bchw_u8: torch.Tensor) -> List[np.ndarray]: + """ + Convert uint8 BCHW torch tensor to a list of uint8 HWC numpy arrays. + """ + if img_bchw_u8.ndim != 4: + raise ValueError(f"Expected BCHW, got {tuple(img_bchw_u8.shape)}") + if img_bchw_u8.dtype != torch.uint8: + raise ValueError(f"Expected uint8 image tensor, got {img_bchw_u8.dtype}") + x = img_bchw_u8.permute(0, 2, 3, 1).contiguous().cpu().numpy() # BHWC uint8 + return [x[i] for i in range(x.shape[0])] + + +def _load_hpt_dinov3(model_id: str, *, output_dim: int, device: str): + """ + Load DINOv3 backbone via `DinoV3` from egomimic's HPT code, but keep a HF + `AutoImageProcessor` for correct pixel preprocessing. + + `DinoV3.forward` expects input shaped [B, T, N, 3, H, W] and returns projected + token embeddings shaped [(B*T*N), num_tokens, output_dim]. + """ + try: + from transformers import AutoImageProcessor + except Exception as e: + raise RuntimeError( + "Transformers is required for HF DINO models. Install with `pip install transformers`." 
+ ) from e + + processor = AutoImageProcessor.from_pretrained(model_id) + stem = DinoV3(output_dim=output_dim, model_type=model_id, freeze_backbone=True) + stem.eval() + stem.to(device) + return processor, stem + + +@torch.no_grad() +def _embed_batch_dinov3(processor, stem: torch.nn.Module, images_hwc_u8: List[np.ndarray], device: str) -> torch.Tensor: + """ + Returns (B, D) embeddings (CLS token) using: + - HF processor -> pixel_values (B,C,H,W) + - DinoV3 stem -> token embeddings, then take token 0 (CLS) + """ + inputs = processor(images=images_hwc_u8, return_tensors="pt") + if "pixel_values" not in inputs: + raise RuntimeError("HF processor did not return 'pixel_values'.") + pixel_values = inputs["pixel_values"].to(device) # (B,C,H,W), float + # DinoV3 expects [B, T, N, C, H, W]; we treat each frame as T=1, N=1 + x = pixel_values.unsqueeze(1).unsqueeze(2) + tok = stem(x) # (B, num_tokens, D) because B*T*N == B + if tok.ndim != 3: + raise RuntimeError(f"Unexpected DinoV3 output shape: {tuple(tok.shape)}") + return tok[:, 0, :] # CLS token + + +def _flatten_metadata(sample: Dict[str, Any]) -> Dict[str, Any]: + """ + Make a metadata dict that is safe for DataFrame/Parquet. + We keep common RLDB/LeRobot fields if present and also include any `metadata.*` keys. + """ + out: Dict[str, Any] = {} + + # Common keys we expect in LeRobot datasets + for k in ("episode_index", "frame_index", "timestamp", "annotations", "task", "task_description"): + if k in sample: + out[k] = sample[k] + + # Some datasets include these: + for k in ("dataset_index", "index", "step", "episode_id"): + if k in sample and k not in out: + out[k] = sample[k] + + # Include all metadata.* keys (e.g. metadata.embodiment, etc.) 
+ for k, v in sample.items(): + if isinstance(k, str) and k.startswith("metadata."): + out[k] = v + + # Make sure tensors/numpy become scalars where appropriate + for k, v in list(out.items()): + if isinstance(v, torch.Tensor): + if v.numel() == 1: + out[k] = v.item() + else: + out[k] = v.detach().cpu().numpy() + elif isinstance(v, np.ndarray): + if v.size == 1: + out[k] = v.item() + return out + + +def _py_scalar(v: Any) -> Any: + """Best-effort conversion for pandas/numpy scalars for Parquet friendliness.""" + try: + import pandas as _pd # local import to avoid hard dependency patterns + + if isinstance(v, _pd.Timestamp): + return v.isoformat() + except Exception: + pass + + # numpy scalar -> python scalar + try: + if isinstance(v, np.generic): + return v.item() + except Exception: + pass + + return v + + +def _instantiate_hydra(cfg_path: str): + """ + Instantiate a dataset from a Hydra-style YAML config. + + Example: + cfg_path="egomimic/hydra_configs/data/viz_data.yaml" + """ + try: + from hydra.utils import instantiate + from omegaconf import OmegaConf + except Exception as e: + raise RuntimeError( + "Hydra instantiation requires `hydra-core` and `omegaconf`." + ) from e + + cfg = OmegaConf.load(cfg_path) + return instantiate(cfg) + + +def _ensure_out_dir(out_dir: Path) -> None: + out_dir.mkdir(parents=True, exist_ok=True) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument( + "--embodiment", + type=str, + default="", + help='e.g. "eva_right_arm" or "eva_bimanual". Required unless using --data-config.', + ) + ap.add_argument("--out-dir", type=str, default="egomimic/scripts/visualization_process/data2") + ap.add_argument( + "--data-config", + type=str, + default="", + help=( + "Optional Hydra YAML path for dataset instantiation, e.g. " + "egomimic/hydra_configs/data/viz_data.yaml. If set, the dataset is " + "created via hydra `instantiate()` from --data-split/--dataset-name." 
+ ), + ) + ap.add_argument( + "--image-keys", + type=str, + nargs="+", + default=["observations.images.front_img_1"], + help="LeRobot image keys to embed (can pass multiple).", + ) + ap.add_argument( + "--model", + type=str, + default="facebook/dinov3-vitl16-pretrain-lvd1689m", + help="HuggingFace model id for DINO (e.g. facebook/dinov3-vitl16-pretrain-lvd1689m).", + ) + ap.add_argument("--batch-size", type=int, default=240) + ap.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu") + ap.add_argument("--num-frames", type=int, default=-1, help="Limit number of frames for debugging") + ap.add_argument( + "--every-k-datapoint", + type=int, + default=15, + help="Keep only every k-th datapoint (0,k,2k,...) to reduce compute. Set to 1 to keep all.", + ) + ap.add_argument( + "--debug-first-batch", + action="store_true", + help="Process/save only the first batch, then exit (useful for debugging).", + ) + + ap.add_argument( + "--embed-store", + type=str, + default="zarr", + choices=["zarr", "npy"], + help="Embedding storage format. zarr is chunked; npy is memmap.", + ) + ap.add_argument("--embed-dtype", type=str, default="float16", choices=["float16", "float32"]) + ap.add_argument("--chunk-size", type=int, default=8192, help="Write chunk size for zarr") + + args = ap.parse_args() + + out_dir = Path(args.out_dir) + _ensure_out_dir(out_dir) + + + # Dataset instantiation + # If --data-config is provided, treat it as a MultiDataModuleWrapper-style config + # and ONLY use its train_datasets (ignore valid_datasets entirely). + dataset_dict: Dict[str, torch.utils.data.Dataset] = {} + if args.data_config: + cfg = OmegaConf.load(args.data_config) + cfg_data = cfg.data if ("data" in cfg and "train_datasets" in cfg.data) else cfg + if "train_datasets" not in cfg_data: + raise KeyError( + "Expected 'train_datasets' in --data-config (or in data.train_datasets)." 
+ ) + for dataset_name, ds_cfg in cfg_data.train_datasets.items(): + dataset_dict[str(dataset_name)] = hydra.utils.instantiate(ds_cfg) + else: + # CLI-configured dataset; instantiation triggers S3 sync + local load. + if not args.embodiment: + raise ValueError("--embodiment is required when not using --data-config") + filters = {"task": "fold_clothes"} + ds = S3RLDBDataset(embodiment=args.embodiment, mode="total", filters=filters) + dataset_dict[str(args.embodiment)] = ds + + if not dataset_dict: + raise RuntimeError("No datasets were instantiated.") + + dataset_names = list(dataset_dict.keys()) + + # Compute effective per-dataset lengths + global offsets into the shared embedding array + per_dataset_n: Dict[str, int] = {} + per_dataset_offset: Dict[str, int] = {} + per_dataset_keep_indices: Dict[str, List[int]] = {} + running = 0 + k_stride = int(args.every_k_datapoint) + if k_stride <= 0: + k_stride = 1 + for dataset_name in dataset_names: + ds_i = dataset_dict[dataset_name] + n_i = len(ds_i) + if args.num_frames > 0: + n_i = min(n_i, args.num_frames) + if args.debug_first_batch: + n_i = min(n_i, args.batch_size) + keep_idx = list(range(0, n_i, k_stride)) + per_dataset_keep_indices[dataset_name] = keep_idx + per_dataset_offset[dataset_name] = running + per_dataset_n[dataset_name] = len(keep_idx) + running += len(keep_idx) + + n_total = running + print( + "[INFO] Using {} train datasets; total frames to process = {}".format( + len(dataset_names), n_total + ) + ) + + # Model (HPT DinoV3 stem + HF processor) + # If the CLI flag was removed, default to 1024 (common for ViT-L features). + dino_output_dim = getattr(args, "dino_output_dim", 1024) + processor, stem = _load_hpt_dinov3( + args.model, output_dim=dino_output_dim, device=args.device + ) + + # Probe embedding dim + first = dataset_dict[dataset_names[0]][0] + probe_key = args.image_keys[0] + if probe_key not in first: + raise KeyError( + f"Image key '{probe_key}' not found in sample. 
Available keys (truncated): {list(first.keys())[:30]}" + ) + probe_img = _image_to_torch_uint8_bchw(first[probe_key]) # uint8 BCHW + probe_list = _bchw_u8_to_list_hwc_u8(probe_img) + probe_emb = _embed_batch_dinov3(processor, stem, probe_list, args.device) + emb_dim = int(probe_emb.shape[-1]) + print(f"[INFO] Embedding dim: {emb_dim}") + + embed_dtype = np.float16 if args.embed_dtype == "float16" else np.float32 + + # Storage setup + embed_paths = {} + embed_writers = {} + for k in args.image_keys: + safe_name = k.replace("/", "_").replace(".", "_") + if args.embed_store == "npy": + path = out_dir / f"embeddings__{safe_name}.npy" + arr = np.memmap(path, mode="w+", dtype=embed_dtype, shape=(n_total, emb_dim)) + embed_paths[k] = path + embed_writers[k] = arr + else: + # zarr + try: + import zarr + except Exception as e: + raise RuntimeError( + "zarr is not installed but --embed-store=zarr was requested. " + "Either install zarr (pip install zarr numcodecs) or use --embed-store npy." + ) from e + path = out_dir / f"embeddings__{safe_name}.zarr" + root = zarr.open_group(str(path), mode="w") + # Chunk over first dim + chunks = (min(args.chunk_size, n_total), emb_dim) + root.create_dataset( + "embeddings", + shape=(n_total, emb_dim), + chunks=chunks, + dtype=embed_dtype, + overwrite=True, + ) + embed_paths[k] = path + embed_writers[k] = root["embeddings"] + + # Metadata rows (we’ll write parquet at the end; for huge datasets you can switch to incremental writing) + meta_rows: List[Dict[str, Any]] = [] + + engine = create_default_engine() + df = episode_table_to_df(engine) + # Cache episode-level DB metadata by episode_hash for fast per-frame lookup. + # We prefix these keys as "db.*" when writing per-frame metadata rows. 
+ episode_meta_by_hash: Dict[str, Dict[str, Any]] = {} + if "episode_hash" in df.columns: + df_unique = df.drop_duplicates(subset=["episode_hash"]) + for row in df_unique.to_dict(orient="records"): + ep_hash = row.get("episode_hash") + if ep_hash is None: + continue + # store sanitized scalars + episode_meta_by_hash[str(ep_hash)] = {k: _py_scalar(v) for k, v in row.items()} + + # Batch loop across train datasets, writing into one shared embeddings array per image key + bs = args.batch_size + processed = 0 + for dataset_name in dataset_names: + ds = dataset_dict[dataset_name] + keep_idx = per_dataset_keep_indices[dataset_name] + n_eff = len(keep_idx) + offset = per_dataset_offset[dataset_name] + + # Only load/process the kept indices (efficient: filters before model forward) + ds_for_loader = ds if (n_eff == len(ds) and keep_idx == list(range(len(ds)))) else Subset(ds, keep_idx) + loader = DataLoader( + ds_for_loader, + batch_size=bs, + shuffle=False, + num_workers=8, + collate_fn=lambda batch: batch, # keep list[dict] (no tensor stacking) + ) + + for batch_idx, batch_samples in enumerate(loader): + start = batch_idx * bs + end = start + len(batch_samples) + global_start = offset + start + global_end = offset + end + + # metadata + for i, sample in enumerate(batch_samples): + m = _flatten_metadata(sample) + m["dataset_name"] = dataset_name + m["dataset_offset"] = offset + # Index within the Subset (i.e., after every-k subsample), then map back + # to the original dataset index. + subset_pos = batch_idx * bs + i # == start + i + orig_ds_idx = keep_idx[subset_pos] if subset_pos < len(keep_idx) else subset_pos + m["dataset_local_index"] = int(orig_ds_idx) + m["embedding_global_index"] = int(global_start + i) + + # Per-sample index_map lookup (batch may span multiple episodes). 
+ try: + idx_map_name, _ = ds.index_map[int(orig_ds_idx)] + if isinstance(idx_map_name, MultiRLDBDataset): + raise ValueError("idx_map_name is a MultiRLDBDataset, which is not supported") + ep_hash = str(idx_map_name) + m["episode_hash"] = ep_hash + + # Attach episode-level DB metadata (same for all frames in an episode) + db_row = episode_meta_by_hash.get(ep_hash) + if db_row: + for k, v in db_row.items(): + if k == "episode_hash": + continue + m[str(k)] = v + except Exception: + pass + + meta_rows.append(m) + + # embeddings per image key + for key in args.image_keys: + imgs_bchw = [] + for sample in batch_samples: + if key not in sample: + raise KeyError( + f"Missing image key '{key}' in sample. Keys: {list(sample.keys())[:30]}" + ) + imgs_bchw.append(_image_to_torch_uint8_bchw(sample[key])) + img_bchw = torch.cat(imgs_bchw, dim=0) # uint8 BCHW on CPU + images_hwc = _bchw_u8_to_list_hwc_u8(img_bchw) # list[np.uint8 HWC] + emb_t = _embed_batch_dinov3(processor, stem, images_hwc, args.device) + emb = emb_t.detach().cpu().numpy().astype(embed_dtype, copy=False) + + writer = embed_writers[key] + writer[global_start:global_end, :] = emb + + processed = global_end + if (processed // bs) % 10 == 0: + print(f"[INFO] Processed {processed}/{n_total}") + + if args.debug_first_batch: + print("[DEBUG] Exiting after first batch (--debug-first-batch).") + break + + if args.debug_first_batch: + break + + # Finalize memmaps + if args.embed_store == "npy": + for k, arr in embed_writers.items(): + if isinstance(arr, np.memmap): + arr.flush() + + # Write metadata + meta_df = pd.DataFrame(meta_rows) + meta_path = out_dir / "metadata.parquet" + meta_df.to_parquet(meta_path, index=False) + + # Small manifest + manifest = { + "n_frames": n_total, + "embedding_dim": emb_dim, + "image_keys": list(args.image_keys), + "embed_store": args.embed_store, + "embed_dtype": args.embed_dtype, + "every_k_datapoint": int(args.every_k_datapoint), + "embeddings": {k: str(p) for k, p in 
embed_paths.items()}, + "metadata_parquet": str(meta_path), + "datasets": { + name: { + "n_frames": int(per_dataset_n[name]), + "offset": int(per_dataset_offset[name]), + } + for name in dataset_names + }, + } + (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2)) + + print(f"[DONE] Wrote metadata: {meta_path}") + for k, p in embed_paths.items(): + print(f"[DONE] Wrote embeddings for {k}: {p}") + print(f"[DONE] Wrote manifest: {out_dir / 'manifest.json'}") + + +if __name__ == "__main__": + main() + + diff --git a/egomimic/scripts/visualization_process/process_image.sbatch b/egomimic/scripts/visualization_process/process_image.sbatch new file mode 100644 index 00000000..c740205b --- /dev/null +++ b/egomimic/scripts/visualization_process/process_image.sbatch @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --partition=hoffman-lab +#SBATCH --account=hoffman-lab +#SBATCH --nodes=1 +#SBATCH --gres=gpu:a40:1 +#SBATCH --cpus-per-task=12 +#SBATCH --output=logs/process_image/%j.out +cd /nethome/paphiwetsa3/flash/projects/EgoVerse + +source .venv/bin/activate + +srun --cpus-per-task=12 python egomimic/scripts/visualization_process/process_image.py --data-config /nethome/paphiwetsa3/flash/projects/EgoVerse/egomimic/hydra_configs/data/clothe_eva_aria_mecka.yaml --out-dir egomimic/scripts/visualization_process/fold_clothes_aria_eva_mecka_all diff --git a/egomimic/scripts/visualization_process/visualization.py b/egomimic/scripts/visualization_process/visualization.py new file mode 100644 index 00000000..d1fcb490 --- /dev/null +++ b/egomimic/scripts/visualization_process/visualization.py @@ -0,0 +1,339 @@ +""" +Gigantic 2D t-SNE scatter plot colored by a chosen metadata column. 
+ +Reads: +- manifest.json (for zarr + metadata paths) +- metadata.parquet (label column is configurable; defaults to lab-like columns) +- embeddings zarr group (expects dataset 'tsne_2d' by default) + +Writes: +- a large PNG scatter plot to the data directory +""" + +import argparse +import json +from pathlib import Path +import re + +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import zarr + + +mpl.rcParams["font.family"] = "monospace" +mpl.rcParams["font.monospace"] = [ + "SF Mono", + "Menlo", + "Monaco", + "Source Code Pro", + "IBM Plex Mono", + "DejaVu Sans Mono", + "Liberation Mono", +] + + +def _pick_label_column(df: pd.DataFrame, label_col: str) -> str: + """ + Resolve which metadata column to use as labels/colors. + - If label_col is provided, require it to exist. + - Else, fall back to common "lab" column names. + """ + if label_col: + if label_col not in df.columns: + raise KeyError( + "Requested --label-col '{}' not found in metadata. Available columns (truncated): {}".format( + label_col, list(df.columns)[:50] + ) + ) + return label_col + + for c in ("lab", "db.lab", "metadata.lab"): + if c in df.columns: + return c + raise KeyError( + "Could not infer a default label column. Tried: lab, db.lab, metadata.lab. " + "Pass --label-col to choose a column explicitly. Available columns (truncated): {}".format( + list(df.columns)[:50] + ) + ) + + +def _load_omit_configs(*, omit_configs_json: str, omit_configs_file: str) -> list[dict]: + """ + Loads omit configs as a list of dicts. + + Semantics: + - Each dict is a conjunction (AND) of column==value matches. + - The list is a disjunction (OR) across dicts. + - Any row matching ANY omit dict is removed from the plot. 
+ """ + + omit_configs: list[dict] = [] + + if omit_configs_json: + parsed = json.loads(omit_configs_json) + if not isinstance(parsed, list) or not all(isinstance(x, dict) for x in parsed): + raise TypeError("--omit-configs-json must be a JSON list of dicts") + omit_configs.extend(parsed) + + if omit_configs_file: + p = Path(omit_configs_file) + parsed = json.loads(p.read_text()) + if not isinstance(parsed, list) or not all(isinstance(x, dict) for x in parsed): + raise TypeError("--omit-configs-file must point to a JSON file containing a list of dicts") + omit_configs.extend(parsed) + + # normalize any weird entries (e.g. empty dicts) + omit_configs = [d for d in omit_configs if len(d) > 0] + return omit_configs + + +def _apply_omit_configs( + meta_df: pd.DataFrame, y: np.ndarray, *, omit_configs: list[dict] +) -> tuple[pd.DataFrame, np.ndarray]: + if not omit_configs: + return meta_df, y + + for i, cfg in enumerate(omit_configs): + missing = [k for k in cfg.keys() if k not in meta_df.columns] + if missing: + raise KeyError( + "omit_configs[{}] refers to missing columns: {}. 
Available columns (truncated): {}".format( + i, missing, list(meta_df.columns)[:50] + ) + ) + + omit_mask = np.zeros(len(meta_df), dtype=bool) + for cfg in omit_configs: + m = pd.Series(True, index=meta_df.index) + for k, v in cfg.items(): + col = meta_df[k] + if v is None: + m = m & col.isna() + elif isinstance(v, str): + m = m & (col.astype(str) == v) + else: + m = m & (col == v) + omit_mask |= m.to_numpy(dtype=bool) + + keep_mask = ~omit_mask + kept = int(keep_mask.sum()) + removed = int(omit_mask.sum()) + print( + "[INFO] omit_configs removed {} / {} rows (kept {})".format( + removed, len(meta_df), kept + ) + ) + meta_df = meta_df.loc[keep_mask].reset_index(drop=True) + y = y[keep_mask] + return meta_df, y + + +def _apply_sample_every_k( + meta_df: pd.DataFrame, y: np.ndarray, *, sample_every_k: int +) -> tuple[pd.DataFrame, np.ndarray]: + if sample_every_k <= 1: + return meta_df, y + meta_df = meta_df.iloc[::sample_every_k].reset_index(drop=True) + y = y[::sample_every_k] + print( + "[INFO] sample_every_k={} kept {} / {} rows".format( + sample_every_k, len(meta_df), len(y) * sample_every_k + ) + ) + return meta_df, y + + +def _safe_filename(s: str, *, max_len: int = 120) -> str: + s = s.strip() + # Replace whitespace with underscores + s = re.sub(r"\s+", "_", s) + # Keep only common safe characters + s = re.sub(r"[^A-Za-z0-9._-]+", "_", s) + # Collapse repeats and trim + s = re.sub(r"_+", "_", s).strip("._-") + if not s: + s = "plot" + if len(s) > max_len: + s = s[:max_len].rstrip("._-") + return s + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument( + "--manifest", + type=str, + default="egomimic/scripts/visualization_process/fold_clothes_aria_eva_all_labs/manifest.json", + ) + ap.add_argument("--image-key", type=str, default="", help="Defaults to first manifest image key.") + ap.add_argument( + "--reduce-method", + type=str, + default="tsne", + choices=("tsne", "umap", "pca"), + help="Which 2D reduction result to visualize (selects _2d by 
default).", + ) + ap.add_argument( + "--reduce-name", + dest="reduce_name", + type=str, + default=None, + help="Dataset name inside the zarr group to visualize (overrides --reduce-method).", + ) + # Backwards-compatible alias (tsne-name historically meant "which 2D coords dataset to plot") + ap.add_argument( + "--tsne-name", + dest="reduce_name", + type=str, + default=None, + help="(Deprecated) Same as --reduce-name.", + ) + ap.add_argument( + "--label-col", + type=str, + default="robot_name", + help=( + "Metadata column to color points by (e.g. 'lab', 'db.operator', 'task', 'episode_hash'). " + "If omitted, tries lab-like columns: lab, db.lab, metadata.lab." + ), + ) + ap.add_argument("--out", type=str, default="", help="Output png path (defaults next to manifest).") + ap.add_argument("--figsize", type=float, nargs=2, default=(12, 12), help="Figure size in inches (W H).") + ap.add_argument("--dpi", type=int, default=400) + ap.add_argument("--point-size", type=float, default=40.0) + ap.add_argument("--alpha", type=float, default=0.2) + ap.add_argument( + "--title", + type=str, + default="", + help="If provided, overrides the default plot title.", + ) + ap.add_argument( + "--omit-configs-json", + type=str, + default="", + help=( + "JSON list of dicts specifying metadata rows to OMIT. " + "Example: '[{\"robot_name\":\"eva_bimanual\"}, {\"lab\":\"song\",\"operator\":\"rl2\"}]'. " + "Each dict is an AND across keys; the list is OR across dicts." + ), + ) + ap.add_argument( + "--omit-configs-file", + type=str, + default="", + help="Path to a JSON file containing a list of dicts (same format as --omit-configs-json).", + ) + ap.add_argument( + "--sample-every-k", + type=int, + default=1, + help="Keep every k-th datapoint (applied after omit filters). 
Use 1 to disable.", + ) + args = ap.parse_args() + + manifest_path = Path(args.manifest) + manifest = json.loads(manifest_path.read_text()) + + if args.image_key: + image_key = args.image_key + else: + image_key = manifest["image_keys"][0] + + zarr_path = Path(manifest["embeddings"][image_key]) + meta_path = Path(manifest["metadata_parquet"]) + + meta_df = pd.read_parquet(meta_path) + label_col = _pick_label_column(meta_df, args.label_col) + + root = zarr.open_group(str(zarr_path), mode="r") + reduce_name = args.reduce_name if args.reduce_name else f"{args.reduce_method}_2d" + if reduce_name not in root: + raise KeyError( + "Could not find '{}' in zarr group. Available arrays: {}".format( + reduce_name, list(root.array_keys()) + ) + ) + y = np.asarray(root[reduce_name][:]) # (N,2) + if y.ndim != 2 or y.shape[1] != 2: + raise RuntimeError("Unexpected 2D reduction shape for '{}': {}".format(reduce_name, y.shape)) + + if len(meta_df) != y.shape[0]: + raise RuntimeError( + "Row mismatch: metadata has {} rows but '{}' has {} rows".format( + len(meta_df), reduce_name, y.shape[0] + ) + ) + + omit_configs = _load_omit_configs( + omit_configs_json=args.omit_configs_json, + omit_configs_file=args.omit_configs_file, + ) + meta_df, y = _apply_omit_configs(meta_df, y, omit_configs=omit_configs) + meta_df, y = _apply_sample_every_k(meta_df, y, sample_every_k=args.sample_every_k) + + labels = meta_df[label_col].astype(str).fillna("unknown").to_numpy() + uniq_labels, label_codes = np.unique(labels, return_inverse=True) + + # Build a categorical colormap with enough distinct colors + cmap = plt.get_cmap("tab20", max(1, len(uniq_labels))) + + fig, ax = plt.subplots(figsize=tuple(args.figsize), dpi=args.dpi) + ax.scatter( + y[:, 0], + y[:, 1], + c=label_codes, + cmap=cmap, + s=args.point_size, + alpha=args.alpha, + linewidths=0, + rasterized=True, + ) + + if args.title: + title = args.title + else: + title = "t-SNE of embeddings (colored by {}: {})".format("label", label_col) + # 
Title at the very top (above legend + axes) + fig.suptitle(title, y=0.99, fontsize=24) + ax.grid(False) + + # Legend (label key): place at top, horizontal layout (figure-level for tighter spacing) + handles = [ + plt.Line2D([0], [0], marker="o", linestyle="", color=cmap(i), markersize=6) + for i in range(len(uniq_labels)) + ] + ncol = min(max(1, len(uniq_labels)), 10) + fig.legend( + handles, + uniq_labels.tolist(), + loc="upper center", + bbox_to_anchor=(0.5, 0.96), + frameon=False, + fontsize=16, + ncol=ncol, + borderaxespad=0.0, + columnspacing=1.0, + ) + + # Reserve minimal top space for suptitle + legend + fig.tight_layout(rect=(0.0, 0.0, 1.0, 0.97)) + + if args.out: + out_path = Path(args.out) + else: + if args.title: + out_path = manifest_path.parent / f"{_safe_filename(args.title)}.png" + else: + safe_label = label_col.replace("/", "_").replace(".", "_") + out_path = manifest_path.parent / f"tsne_by_{safe_label}.png" + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, bbox_inches="tight") + print("[DONE] wrote", out_path) + + +if __name__ == "__main__": + main() diff --git a/egomimic/scripts/visualization_process/visualization_clothes_embodiment.py b/egomimic/scripts/visualization_process/visualization_clothes_embodiment.py new file mode 100644 index 00000000..4b0c6776 --- /dev/null +++ b/egomimic/scripts/visualization_process/visualization_clothes_embodiment.py @@ -0,0 +1,514 @@ +""" +Gigantic 2D t-SNE scatter plot colored by a chosen metadata column. + +Reads: +- manifest.json (for zarr + metadata paths) +- metadata.parquet (label column is configurable; defaults to lab-like columns) +- embeddings zarr group (expects dataset 'tsne_2d' by default) + +Writes: +- a large PNG scatter plot to the data directory + +Plot config notes: +- `plot_background_color`: figure/axes background (e.g. "#ecdbc7"). Empty/None disables. +- `plot_background_alpha`: optional float in [0,1] (defaults to 1.0). 
+""" + +import argparse +import json +from pathlib import Path +import re + +import matplotlib as mpl +import matplotlib.pyplot as plt +from matplotlib.colors import to_rgba +import numpy as np +import pandas as pd +import zarr + + +mpl.rcParams["font.family"] = "monospace" +mpl.rcParams["font.monospace"] = [ + "SF Mono", + "Menlo", + "Monaco", + "Source Code Pro", + "IBM Plex Mono", + "DejaVu Sans Mono", + "Liberation Mono", +] + + +def _pick_label_column(df: pd.DataFrame, label_col: str) -> str: + """ + Resolve which metadata column to use as labels/colors. + - If label_col is provided, require it to exist. + - Else, fall back to common "lab" column names. + """ + if label_col: + if label_col not in df.columns: + raise KeyError( + "Requested --label-col '{}' not found in metadata. Available columns (truncated): {}".format( + label_col, list(df.columns)[:50] + ) + ) + return label_col + + for c in ("lab", "db.lab", "metadata.lab"): + if c in df.columns: + return c + raise KeyError( + "Could not infer a default label column. Tried: lab, db.lab, metadata.lab. " + "Pass --label-col to choose a column explicitly. Available columns (truncated): {}".format( + list(df.columns)[:50] + ) + ) + + +def _load_omit_configs(*, omit_configs_json: str, omit_configs_file: str) -> list[dict]: + """ + Loads omit configs as a list of dicts. + + Semantics: + - Each dict is a conjunction (AND) of column==value matches. + - The list is a disjunction (OR) across dicts. + - Any row matching ANY omit dict is removed from the plot. 
+ """ + + omit_configs: list[dict] = [] + + if omit_configs_json: + parsed = json.loads(omit_configs_json) + if not isinstance(parsed, list) or not all(isinstance(x, dict) for x in parsed): + raise TypeError("--omit-configs-json must be a JSON list of dicts") + omit_configs.extend(parsed) + + if omit_configs_file: + p = Path(omit_configs_file) + parsed = json.loads(p.read_text()) + if not isinstance(parsed, list) or not all(isinstance(x, dict) for x in parsed): + raise TypeError("--omit-configs-file must point to a JSON file containing a list of dicts") + omit_configs.extend(parsed) + + # normalize any weird entries (e.g. empty dicts) + omit_configs = [d for d in omit_configs if len(d) > 0] + return omit_configs + + +def _apply_omit_configs( + meta_df: pd.DataFrame, y: np.ndarray, *, omit_configs: list[dict] +) -> tuple[pd.DataFrame, np.ndarray]: + if not omit_configs: + return meta_df, y + + for i, cfg in enumerate(omit_configs): + missing = [k for k in cfg.keys() if k not in meta_df.columns] + if missing: + raise KeyError( + "omit_configs[{}] refers to missing columns: {}. 
Available columns (truncated): {}".format( + i, missing, list(meta_df.columns)[:50] + ) + ) + + omit_mask = np.zeros(len(meta_df), dtype=bool) + for cfg in omit_configs: + m = pd.Series(True, index=meta_df.index) + for k, v in cfg.items(): + col = meta_df[k] + if v is None: + m = m & col.isna() + elif isinstance(v, str): + m = m & (col.astype(str) == v) + else: + m = m & (col == v) + omit_mask |= m.to_numpy(dtype=bool) + + keep_mask = ~omit_mask + kept = int(keep_mask.sum()) + removed = int(omit_mask.sum()) + print( + "[INFO] omit_configs removed {} / {} rows (kept {})".format( + removed, len(meta_df), kept + ) + ) + meta_df = meta_df.loc[keep_mask].reset_index(drop=True) + y = y[keep_mask] + return meta_df, y + + +def _apply_sample_every_k( + meta_df: pd.DataFrame, y: np.ndarray, *, sample_every_k: int +) -> tuple[pd.DataFrame, np.ndarray]: + if sample_every_k <= 1: + return meta_df, y + before = len(meta_df) + meta_df = meta_df.iloc[::sample_every_k].reset_index(drop=True) + y = y[::sample_every_k] + print( + "[INFO] sample_every_k={} kept {} / {} rows".format( + sample_every_k, len(meta_df), before + ) + ) + return meta_df, y + + +def _load_plot_config(*, plot_config_json: str, plot_config_file: str) -> dict: + """ + Load a plotting config dict. 
+ + Supported schema (both forms accepted): + - {"label_col": "robot_name", + "label_col_name": [{"eva_bimanual": {"color": "#...", "legend_name": "Robot"}}, ...]} + - {"label_col": "robot_name", + "label_col_name": {"eva_bimanual": {"color": "#...", "legend_name": "Robot"}, ...}} + """ + cfg: dict = {} + if plot_config_json: + cfg = json.loads(plot_config_json) + if not isinstance(cfg, dict): + raise TypeError("--plot-config-json must be a JSON object (dict)") + return cfg + if plot_config_file: + p = Path(plot_config_file) + cfg = json.loads(p.read_text()) + if not isinstance(cfg, dict): + raise TypeError("--plot-config-file must point to a JSON file containing an object (dict)") + return cfg + return {} + + +def _normalize_label_styles(plot_cfg: dict) -> tuple[list[str], dict[str, dict]]: + """ + Returns (ordered_label_values, label_value->style_dict). + """ + label_col_name = plot_cfg.get("label_col_name", None) + if not label_col_name: + return [], {} + + if isinstance(label_col_name, dict): + ordered = list(label_col_name.keys()) + styles = label_col_name + elif isinstance(label_col_name, list): + ordered = [] + styles = {} + for entry in label_col_name: + if not isinstance(entry, dict) or len(entry) != 1: + raise TypeError( + "plot_config['label_col_name'] entries must be dicts with a single key, got: {}".format( + entry + ) + ) + (k, v), = entry.items() + ordered.append(str(k)) + styles[str(k)] = v if isinstance(v, dict) else {} + else: + raise TypeError( + "plot_config['label_col_name'] must be a dict or list, got: {}".format(type(label_col_name)) + ) + return ordered, {str(k): (v if isinstance(v, dict) else {}) for k, v in styles.items()} + + +def _build_colors_and_legend( + labels: np.ndarray, + *, + ordered_styles: list[str], + style_map: dict[str, dict], +) -> tuple[np.ndarray, list[plt.Line2D], list[str]]: + """ + Returns (per_point_rgba Nx4, legend_handles, legend_names). 
+ + - Labels listed in ordered_styles get their provided colors (if any) and legend names (if any). + - Remaining labels get colors from tab20. + - Legend order: ordered_styles first (if present in data), then remaining in first-seen order. + """ + labels = labels.astype(str) + present = set(labels.tolist()) + ordered_present = [v for v in ordered_styles if v in present] + + # Stable "first seen" order for labels not in ordered_styles + remainder = [] + seen = set(ordered_present) + for v in labels.tolist(): + if v in present and v not in seen: + seen.add(v) + remainder.append(v) + + # Assign colors + label_to_rgba: dict[str, tuple[float, float, float, float]] = {} + for v in ordered_present: + style = style_map.get(v, {}) + if "color" in style and style["color"]: + label_to_rgba[v] = to_rgba(style["color"]) + else: + # fallback color if not provided + label_to_rgba[v] = to_rgba("#4a4e69") + + if remainder: + cmap = plt.get_cmap("tab20", max(1, len(remainder))) + for i, v in enumerate(remainder): + label_to_rgba[v] = cmap(i) + + point_colors = np.asarray([label_to_rgba[v] for v in labels], dtype=float) + + # Legend labels (names) + legend_order = ordered_present + remainder + legend_names = [] + handles = [] + for v in legend_order: + style = style_map.get(v, {}) + legend_names.append(str(style.get("legend_name", v))) + handles.append(plt.Line2D([0], [0], marker="o", linestyle="", color=label_to_rgba[v], markersize=12)) + + return point_colors, handles, legend_names + + +def _safe_filename(s: str, *, max_len: int = 120) -> str: + s = s.strip() + # Replace whitespace with underscores + s = re.sub(r"\s+", "_", s) + # Keep only common safe characters + s = re.sub(r"[^A-Za-z0-9._-]+", "_", s) + # Collapse repeats and trim + s = re.sub(r"_+", "_", s).strip("._-") + if not s: + s = "plot" + if len(s) > max_len: + s = s[:max_len].rstrip("._-") + return s + + +def _apply_plot_background(*, fig: plt.Figure, ax: plt.Axes, plot_cfg: dict) -> None: + """ + Apply a plot 
def _apply_plot_background(*, fig: plt.Figure, ax: plt.Axes, plot_cfg: dict) -> None:
    """
    Apply a plot background (figure + axes facecolor) from plot_cfg.

    Reads 'plot_background_color' (any matplotlib color spec) and the optional
    'plot_background_alpha' (clipped to [0, 1]; defaults to 1.0 on bad input).
    No-op when no color is configured.
    """
    bg = plot_cfg.get("plot_background_color", None)
    if bg is None:
        return
    bg = str(bg).strip()
    if not bg:
        return

    alpha = plot_cfg.get("plot_background_alpha", 1.0)
    try:
        alpha = float(alpha)
    # Narrowed from a bare `except Exception`: only conversion failures should
    # fall back to fully opaque; anything else is a real bug worth surfacing.
    except (TypeError, ValueError):
        alpha = 1.0
    alpha = float(np.clip(alpha, 0.0, 1.0))

    rgba = to_rgba(bg, alpha=alpha)
    fig.patch.set_facecolor(rgba)
    ax.set_facecolor(rgba)


def main():
    """
    CLI entry point: scatter-plot a precomputed 2D reduction (t-SNE/UMAP/PCA)
    of embeddings listed in a manifest, colored by a metadata column.

    Reads 2D coordinates from the zarr group referenced by the manifest and
    row metadata from the manifest's parquet file, applies optional omit
    filters and subsampling, then writes a PNG next to the manifest (or to
    --out).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--manifest",
        type=str,
        default="egomimic/scripts/visualization_process/fold_clothes_aria_eva_all_labs/manifest.json",
    )
    ap.add_argument("--image-key", type=str, default="", help="Defaults to first manifest image key.")
    ap.add_argument(
        "--reduce-method",
        type=str,
        default="tsne",
        choices=("tsne", "umap", "pca"),
        help="Which 2D reduction result to visualize (selects _2d by default).",
    )
    ap.add_argument(
        "--reduce-name",
        dest="reduce_name",
        type=str,
        default=None,
        help="Dataset name inside the zarr group to visualize (overrides --reduce-method).",
    )
    # Backwards-compatible alias (tsne-name historically meant "which 2D coords dataset to plot")
    ap.add_argument(
        "--tsne-name",
        dest="reduce_name",
        type=str,
        default=None,
        help="(Deprecated) Same as --reduce-name.",
    )
    ap.add_argument(
        "--label-col",
        type=str,
        default="robot_name",
        help=(
            "Metadata column to color points by (e.g. 'lab', 'db.operator', 'task', 'episode_hash'). "
            "If omitted, tries lab-like columns: lab, db.lab, metadata.lab."
        ),
    )
    ap.add_argument("--out", type=str, default="", help="Output png path (defaults next to manifest).")
    ap.add_argument("--figsize", type=float, nargs=2, default=(12, 12), help="Figure size in inches (W H).")
    ap.add_argument("--dpi", type=int, default=400)
    ap.add_argument("--point-size", type=float, default=40.0)
    ap.add_argument("--alpha", type=float, default=0.2)
    ap.add_argument(
        "--title",
        type=str,
        default="",
        help="If provided, overrides the default plot title.",
    )
    ap.add_argument(
        "--omit-configs-json",
        type=str,
        default="",
        help=(
            "JSON list of dicts specifying metadata rows to OMIT. "
            "Example: '[{\"robot_name\":\"eva_bimanual\"}, {\"lab\":\"song\",\"operator\":\"rl2\"}]'. "
            "Each dict is an AND across keys; the list is OR across dicts."
        ),
    )
    ap.add_argument(
        "--omit-configs-file",
        type=str,
        default="",
        help="Path to a JSON file containing a list of dicts (same format as --omit-configs-json).",
    )
    ap.add_argument(
        "--plot-config-json",
        type=str,
        default="",
        help=(
            "JSON object configuring label styles (colors/legend names). "
            "If provided, overrides the in-script default mapping."
        ),
    )
    ap.add_argument(
        "--plot-config-file",
        type=str,
        default="",
        help="Path to a JSON file containing a plotting config object (same as --plot-config-json).",
    )
    ap.add_argument(
        "--sample-every-k",
        type=int,
        default=1,
        help="Keep every k-th datapoint (applied after omit filters). Use 1 to disable.",
    )
    args = ap.parse_args()

    # In-script defaults for the known embodiments; any user-supplied config
    # overrides these key-by-key via dict union below.
    default_plot_config = {
        "label_col": args.label_col,
        "plot_background_color": "#FFFFFF",
        "label_col_name": [
            {"eva_bimanual": {"color": "#009e73", "legend_name": "Robot"}},
            {"aria_bimanual": {"color": "#2462a3", "legend_name": "EgoVerse-A"}},
            {"mecka_bimanual": {"color": "#e5a423", "legend_name": "EgoVerse-I"}},
        ],
    }

    plot_cfg = default_plot_config | _load_plot_config(
        plot_config_json=args.plot_config_json,
        plot_config_file=args.plot_config_file,
    )
    label_col = plot_cfg.get("label_col", args.label_col)

    manifest_path = Path(args.manifest)
    manifest = json.loads(manifest_path.read_text())

    image_key = args.image_key if args.image_key else manifest["image_keys"][0]

    zarr_path = Path(manifest["embeddings"][image_key])
    meta_path = Path(manifest["metadata_parquet"])

    meta_df = pd.read_parquet(meta_path)
    label_col = _pick_label_column(meta_df, label_col)

    root = zarr.open_group(str(zarr_path), mode="r")
    reduce_name = args.reduce_name if args.reduce_name else f"{args.reduce_method}_2d"
    if reduce_name not in root:
        raise KeyError(
            "Could not find '{}' in zarr group. Available arrays: {}".format(
                reduce_name, list(root.array_keys())
            )
        )
    y = np.asarray(root[reduce_name][:])  # expected (N, 2)
    if y.ndim != 2 or y.shape[1] != 2:
        raise RuntimeError("Unexpected 2D reduction shape for '{}': {}".format(reduce_name, y.shape))

    if len(meta_df) != y.shape[0]:
        raise RuntimeError(
            "Row mismatch: metadata has {} rows but '{}' has {} rows".format(
                len(meta_df), reduce_name, y.shape[0]
            )
        )

    omit_configs = _load_omit_configs(
        omit_configs_json=args.omit_configs_json,
        omit_configs_file=args.omit_configs_file,
    )
    meta_df, y = _apply_omit_configs(meta_df, y, omit_configs=omit_configs)
    meta_df, y = _apply_sample_every_k(meta_df, y, sample_every_k=args.sample_every_k)

    # BUGFIX: fillna must run BEFORE astype(str) — astype(str) turns NaN into
    # the literal string "nan", so the original fillna("unknown") never fired.
    labels = meta_df[label_col].fillna("unknown").astype(str).to_numpy()
    ordered_styles, style_map = _normalize_label_styles(plot_cfg)
    point_colors, legend_handles, legend_names = _build_colors_and_legend(
        labels, ordered_styles=ordered_styles, style_map=style_map
    )

    fig, ax = plt.subplots(figsize=tuple(args.figsize), dpi=args.dpi)
    _apply_plot_background(fig=fig, ax=ax, plot_cfg=plot_cfg)
    ax.scatter(
        y[:, 0],
        y[:, 1],
        c=point_colors,
        s=args.point_size,
        alpha=args.alpha,
        linewidths=0,
        rasterized=True,
    )

    if args.title:
        title = args.title
    else:
        title = "t-SNE of embeddings (colored by {}: {})".format("label", label_col)
    # Title at the very top (above legend + axes)
    fig.suptitle(title, y=0.99, fontsize=24)
    ax.grid(False)

    # Legend (label key): place at top, horizontal layout (figure-level for tighter spacing)
    ncol = min(max(1, len(legend_names)), 10)
    fig.legend(
        legend_handles,
        legend_names,
        loc="upper center",
        bbox_to_anchor=(0.5, 0.96),
        frameon=False,
        fontsize=24,
        ncol=ncol,
        borderaxespad=0.0,
        columnspacing=1.0,
    )

    # Reserve minimal top space for suptitle + legend
    fig.tight_layout(rect=(0.0, 0.0, 1.0, 0.97))

    if args.out:
        out_path = Path(args.out)
    else:
        if args.title:
            out_path = manifest_path.parent / f"{_safe_filename(args.title)}.png"
        else:
            safe_label = label_col.replace("/", "_").replace(".", "_")
            out_path = manifest_path.parent / f"tsne_by_{safe_label}.png"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, bbox_inches="tight", facecolor=fig.get_facecolor())
    print("[DONE] wrote", out_path)


if __name__ == "__main__":
    main()