Changes from all commits
135 commits
2631242
fix readme typo
pcintara Mar 8, 2024
d2acb84
Merge pull request #1 from pclittle/fix-readme-typo
kpinchoco Mar 8, 2024
5deb714
fix README typos evals/ and k400 yaml filename
Mar 8, 2024
d545b7b
fix README typos evals/ and k400 yaml filename
Mar 8, 2024
48e007e
fix typos
Mar 8, 2024
1b3392a
added jepa. in from jepa.evals.scaffold
Mar 11, 2024
df85d19
updated vitl16_k400 yaml and main_distributed files
Mar 28, 2024
14f22a1
edited config data paths and main_distributed parameters for child job
Mar 29, 2024
f0bbbdf
add files
Apr 4, 2024
a9f0ba2
add files modified
Apr 4, 2024
afd3a0c
test eval directory rather than csv
pcintara Apr 9, 2024
2e1df39
Merge pull request #2 from pclittle/main
kpinchoco Apr 9, 2024
e0a5ab8
changes from 4-9-24
Apr 11, 2024
164b0ac
troubleshoot pegs_eval and linear probe
Apr 21, 2024
e4a1bc8
update get_item for pendulum csv
kpinchoco Apr 21, 2024
7ba87e7
add p_mini csvs
Apr 21, 2024
1455c5c
update p_mini_train.csv
kpinchoco Apr 21, 2024
c8e4463
update p_mini_test.csv
kpinchoco Apr 21, 2024
4b96e0a
update num_classes to 165
kpinchoco Apr 21, 2024
4c57830
remove extra dimension in label_tensor
kpinchoco Apr 21, 2024
0c5a3c5
label to label_tensor in squeeze
kpinchoco Apr 21, 2024
ab63f8c
function for plotting model outputs
pcintara Apr 21, 2024
07c77f7
Merge pull request #3 from pclittle/kp_utils
kpinchoco Apr 22, 2024
46fcf59
print encoder output shape
kpinchoco Apr 22, 2024
dbce99e
update filepath to reference border png
kpinchoco Apr 22, 2024
c098819
update filepath to reference border png
kpinchoco Apr 23, 2024
fee8769
update filepath to reference border png
kpinchoco Apr 23, 2024
c7268fa
absolute filepath reference border png
kpinchoco Apr 23, 2024
a3d0db5
output_filename as parameter
kpinchoco Apr 23, 2024
878373f
update filepaths for plot imgs
kpinchoco Apr 23, 2024
1a3e266
update path from kp_utils
kpinchoco Apr 23, 2024
97743ad
update filepath to kp_utils again
kpinchoco Apr 23, 2024
202a072
change to relative filepath
kpinchoco Apr 23, 2024
da7ff26
rearrange nondefault and default arguments
kpinchoco Apr 23, 2024
3db2baf
comment out outputs png
kpinchoco Apr 23, 2024
04ce6de
add third video to p_mini_train.csv
kpinchoco Apr 23, 2024
a240bfb
print pos_embed shape x
kpinchoco Apr 23, 2024
a09aeb7
mp4 to npy
kpinchoco Apr 23, 2024
23adcbc
add print statements
kpinchoco Apr 23, 2024
b27023d
flattened_x
kpinchoco Apr 23, 2024
25c5e07
what is outputs
kpinchoco Apr 23, 2024
ff44176
convert type label tensor
kpinchoco Apr 23, 2024
cd8bb36
update loss
kpinchoco Apr 23, 2024
91f6294
update slicing of outputs
kpinchoco Apr 23, 2024
75cce82
return loss not top1_meter.avg
kpinchoco Apr 23, 2024
412bb58
fix sum_softmax and top1_acc
kpinchoco Apr 23, 2024
b4f6b6c
change loop to top1_acc
kpinchoco Apr 23, 2024
d802e3a
comment out acc
kpinchoco Apr 23, 2024
dedd29c
comment out print outputs
kpinchoco Apr 23, 2024
c1487ec
avg loss and remove % from train and test print
kpinchoco Apr 23, 2024
1ac3f39
5 videos
kpinchoco Apr 24, 2024
a11b1ba
2 one-peg videos
kpinchoco Apr 24, 2024
f39385e
comment print statements in run_one_epoch
kpinchoco Apr 24, 2024
96dd6c6
20 epochs
kpinchoco Apr 24, 2024
8493149
take out extra space before labels filepath
kpinchoco Apr 24, 2024
e3b3c18
decrease num_workers to 8 from 12
kpinchoco Apr 24, 2024
90ed933
128G from 55G slurm mem
kpinchoco Apr 24, 2024
1c621a5
gpus_per_node = 2 rather than tasks_per_node
kpinchoco Apr 24, 2024
712ff56
sum across dim=1 instead of flatten
kpinchoco Apr 24, 2024
87efceb
add print statements back
kpinchoco Apr 24, 2024
373be9c
comment out pos embed print statement
kpinchoco Apr 24, 2024
87d256c
uncomment final shape prints
kpinchoco Apr 24, 2024
ea1483e
update nn.linear
kpinchoco Apr 24, 2024
d12a038
import torch
kpinchoco Apr 24, 2024
b4cf5b4
add yaml and pegs_eval.py files - stil getting Nans for loss
Jun 18, 2024
2ae2bb7
add pegs_eval2.py
Jun 18, 2024
0a3a595
Update pegs_eval2.py
kpinchoco Jun 18, 2024
5cc933d
change criterion back to cross-entropy loss
kpinchoco Jun 18, 2024
fcd9d7a
check gradient norm after loss.backward()
kpinchoco Jun 18, 2024
4476302
add debug print statements
kpinchoco Jun 18, 2024
d4a6666
change back to pegs_eval.py
kpinchoco Jun 18, 2024
4a08333
Update vitl16_k400_16x8x3.yaml
kpinchoco Jun 27, 2024
dee1341
print min/max of encoder output
kpinchoco Jun 27, 2024
0266ff6
print min/max after sum and after linear
kpinchoco Jun 27, 2024
28096ee
comment out print label
kpinchoco Jun 27, 2024
2fdaa43
apply softmax before linear
kpinchoco Jun 27, 2024
01a3477
edit softmax syntax
kpinchoco Jun 27, 2024
45d5e0b
add avgpool(10, stride = 2)
kpinchoco Jul 1, 2024
3168f88
avgpool1d
kpinchoco Jul 1, 2024
afadbd1
fix nn.linear
kpinchoco Jul 1, 2024
d4635e8
AvgPool1d(20, stride=2)
kpinchoco Jul 1, 2024
3be2477
nn.AvgPool1d(50, stride=10)
kpinchoco Jul 1, 2024
5b16a7c
nn.AvgPool1d(50, stride=10)
kpinchoco Jul 1, 2024
19c33ea
508 to 98 in nn.linear
kpinchoco Jul 1, 2024
87f4cd5
10 epochs
kpinchoco Jul 1, 2024
900d9af
adaptivepool 1x330
kpinchoco Jul 1, 2024
241f689
comment out plot_img for now
kpinchoco Jul 1, 2024
e00493a
labels[i].unsqueeze(0)
kpinchoco Jul 1, 2024
94b1023
comment out print statements
kpinchoco Jul 30, 2024
5bb3f21
comment out print statements
kpinchoco Jul 30, 2024
9d4aed0
500 epochs
kpinchoco Jul 30, 2024
f7d1bab
1 gpu, 8:00:00 timeout, slurm_additional_options='--gres=gpu:a100:1'
kpinchoco Jul 31, 2024
795489d
timeout="8:00:00"
kpinchoco Jul 31, 2024
6e0ec23
gres='gpu:a100:1'
kpinchoco Jul 31, 2024
fb1f4d1
slurm_gres='gpu:a100:1'
kpinchoco Jul 31, 2024
eda3c70
additional_parameters={"gres": "gpu:v100:1"}
kpinchoco Jul 31, 2024
d2785d1
slurm_additional_parameters={"gres": "gpu:v100:1"}
kpinchoco Jul 31, 2024
d56f9ed
gres='gpu:v100:1' and partition='gpu'
kpinchoco Jul 31, 2024
3c97dd7
slurm_additional_parameters={"--gres": "gpu:v100:1"}
kpinchoco Jul 31, 2024
174b121
remove specify gpu type
kpinchoco Jul 31, 2024
fcc1e97
timeout="48:00:00"
kpinchoco Jul 31, 2024
0688102
partition='a100_2',
kpinchoco Jul 31, 2024
2e38d8c
slurm_additional_parameters={'gres': 'gpu:a100:1'}
kpinchoco Jul 31, 2024
e61f5d6
slurm_additional_parameters={'--gres': 'gpu:a100:1'})
kpinchoco Jul 31, 2024
95d7963
slurm_additional_parameters={'gres': 'gpu:a100:1'})
kpinchoco Jul 31, 2024
40c9f4e
slurm_additional_parameters={'gres': 'gpu:a100:1(S:0-1)'})
kpinchoco Jul 31, 2024
8c0f56d
Update main_distributed.py
kpinchoco Jul 31, 2024
63bacf9
comment out partition
kpinchoco Jul 31, 2024
8cfb1f1
sigmoid_outputs
kpinchoco Aug 1, 2024
2d71e62
save in checkpoint-models folder instead of ki2130/video_classificati…
kpinchoco Aug 1, 2024
95ca2bd
save to plots folder
kpinchoco Aug 1, 2024
2390a57
10 epochs
kpinchoco Aug 1, 2024
b8ee411
change tag to pegs-probe
kpinchoco Aug 1, 2024
cc92170
comment out gpu a100 specification for now
kpinchoco Aug 1, 2024
79df9a4
Create a sigmoid activation function object
kpinchoco Aug 1, 2024
3519897
sigmoid_outputs on tensor part not list
kpinchoco Aug 1, 2024
3204299
3 epochs
kpinchoco Aug 1, 2024
30435c5
print sigmoid outputs shape
kpinchoco Aug 1, 2024
753cf84
print sigmoid_outputs
kpinchoco Aug 1, 2024
513c9c0
input_tensor[0,idx]
kpinchoco Aug 1, 2024
97c90b5
import os
kpinchoco Aug 1, 2024
c6be822
input_tensor[idx]
kpinchoco Aug 1, 2024
464ec72
sigmoid_outputs.squeeze(0)
kpinchoco Aug 1, 2024
e2e06f6
scale=500 instead of 150
kpinchoco Aug 1, 2024
9da3f7b
scale=1000
kpinchoco Aug 1, 2024
32c6d1a
scale=2000
kpinchoco Aug 1, 2024
5de24d6
comment out min/max of encoded tensor
kpinchoco Aug 1, 2024
4c01cc8
comment out print sigmoid_outputs
kpinchoco Aug 1, 2024
02f3f66
500 epochs
kpinchoco Aug 1, 2024
bffc9a9
pegs-probe-500
kpinchoco Aug 1, 2024
ba4a3a2
epochs 300, pegs-probe-300
kpinchoco Aug 1, 2024
d542e57
add subfolder plots-300
kpinchoco Aug 1, 2024
203e17b
plots-300/
kpinchoco Aug 1, 2024
875218c
take out code you don't need from run_one_epoch
kpinchoco Aug 2, 2024
6dffc08
add code to track train/test loss
kpinchoco Aug 2, 2024
20 changes: 12 additions & 8 deletions README.md
@@ -352,32 +352,36 @@ python -m app.main_distributed \
## Launching Evaluations

### Local training
<<<<<<< HEAD
If you wish to debug your eval code or setup before launching a distributed training run, we provide the functionality to do so by running the pretraining script locally on a multi-GPU (or single-GPU) machine, however, reproducing the full eval would require launching distributed training.
=======
If you wish to debug your eval code or setup before launching a distributed training run, we provide the functionality to do so by running the evaluation script locally on a multi-GPU (or single-GPU) machine, however, reproducing the full eval would require launching distributed training.
The single-machine implementation starts from the [eval/main.py](eval/main.py), which parses the experiment config file and runs the eval locally on a multi-GPU (or single-GPU) machine.
>>>>>>> origin/main
The single-machine implementation starts from the [evals/main.py](evals/main.py), which parses the experiment config file and runs the eval locally on a multi-GPU (or single-GPU) machine.

For example, to run ImageNet image classification on GPUs "0", "1", and "2" on a local machine using the config [configs/eval/vitl16_in1k.yaml](configs/eval/vitl16_in1k.yaml), type the command:
For example, to run ImageNet image classification on GPUs "0", "1", and "2" on a local machine using the config [configs/evals/vitl16_in1k.yaml](configs/evals/vitl16_in1k.yaml), type the command:
```bash
python -m evals.main \
--fname configs/eval/vitl16_in1k.yaml \
--fname configs/evals/vitl16_in1k.yaml \
--devices cuda:0 cuda:1 cuda:2
```


### Distributed training
To launch a distributed evaluation run, the implementation starts from [eval/main_distributed.py](eval/main_distributed.py), which, in addition to parsing the config file, also allows for specifying details about distributed training. For distributed training, we use the popular open-source [submitit](https://github.com/facebookincubator/submitit) tool and provide examples for a SLURM cluster.
To launch a distributed evaluation run, the implementation starts from [evals/main_distributed.py](evals/main_distributed.py), which, in addition to parsing the config file, also allows for specifying details about distributed training. For distributed training, we use the popular open-source [submitit](https://github.com/facebookincubator/submitit) tool and provide examples for a SLURM cluster.

For example, to launch a distributed ImageNet image classification experiment using the config [configs/eval/vitl16_in1k.yaml](configs/eval/vitl16_in1k.yaml), type the command:
For example, to launch a distributed ImageNet image classification experiment using the config [configs/evals/vitl16_in1k.yaml](configs/evals/vitl16_in1k.yaml), type the command:
```bash
python -m evals.main_distributed \
--fname configs/eval/vitl16_in1k.yaml \
--fname configs/evals/vitl16_in1k.yaml \
--folder $path_to_save_stderr_and_stdout \
--partition $slurm_partition
```

Similarly, to launch a distributed K400 video classification experiment using the config [configs/eval/vitl16_k400.yaml](configs/eval/vitl16_k400.yaml), type the command:
Similarly, to launch a distributed K400 video classification experiment using the config [configs/evals/vitl16_k400_16x8x3.yaml](configs/evals/vitl16_k400_16x8x3.yaml), type the command:
```bash
python -m evals.main_distributed \
--fname configs/eval/vitl16_k400.yaml \
--fname configs/evals/vitl16_k400_16x8x3.yaml \
--folder $path_to_save_stderr_and_stdout \
--partition $slurm_partition
```
19 changes: 10 additions & 9 deletions configs/evals/vitl16_k400_16x8x3.yaml
@@ -1,20 +1,21 @@
nodes: 8
tasks_per_node: 8
tag: k400-16x8x3

nodes: 1
tasks_per_node: 1
tag: pegs-probe-300
eval_name: video_classification_frozen
resume_checkpoint: false
data:
dataset_train: /your_path_to_kinetics400_train_csv_file_index.csv
dataset_val: /your_path_to_kinetics400_val_csv_file_index.csv
dataset_train: /scratch/ki2130/my-jepa/jepa/p_dummy.csv
dataset_val: /scratch/ki2130/my-jepa/jepa/p_dummy.csv
dataset_type: VideoDataset
num_classes: 400
num_classes: 165
frames_per_clip: 16
num_segments: 8
num_views_per_segment: 3
frame_step: 4
optimization:
attend_across_segments: true
num_epochs: 20
num_epochs: 300
resolution: 224
batch_size: 4
weight_decay: 0.01
@@ -34,6 +35,6 @@ pretrain:
tight_silu: false
use_sdpa: true
patch_size: 16
folder: /your_absolute_file_path_to_directory_where_pretrained_models_are_contained/
checkpoint: jepa-latest.pth.tar # name of pretrained model file inside folder
folder: /scratch/ki2130/ #/your_absolute_file_path_to_directory_where_pretrained_models_are_contained/
checkpoint: vitl16.pth.tar #jepa-latest.pth.tar # name of pretrained model file inside folder
write_tag: jepa
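
For reference, a minimal sketch of how an `evals/main.py`-style entry point might read this config before handing it to the scaffold. The filename and key names mirror the diff above; everything else (PyYAML usage, the prints) is illustrative and not part of the PR.

```python
# Minimal sketch, assuming PyYAML is installed and the config path from the
# diff exists locally; nothing here is code from the actual PR.
import yaml

with open('configs/evals/vitl16_k400_16x8x3.yaml') as f:
    cfg = yaml.safe_load(f)

# Values changed in this PR:
print(cfg['eval_name'])                    # video_classification_frozen
print(cfg['data']['num_classes'])          # 165
print(cfg['optimization']['num_epochs'])   # 300
print(cfg['pretrain']['checkpoint'])       # vitl16.pth.tar
```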
20 changes: 13 additions & 7 deletions evals/main_distributed.py
@@ -73,8 +73,8 @@ def checkpoint(self):
def launch_evals_with_parsed_args(
args_for_evals,
submitit_folder,
partition='learnlab,learnfair',
timeout=4300,
# partition='a100_2',
timeout="48:00:00",
nodes=1,
tasks_per_node=1,
delay_seconds=10,
@@ -90,13 +90,17 @@ def launch_evals_with_parsed_args(
folder=os.path.join(submitit_folder, 'job_%j'),
slurm_max_num_timeout=20)
executor.update_parameters(
slurm_partition=partition,
slurm_mem_per_gpu='55G',
# slurm_partition=partition,
slurm_mem='128G',
timeout_min=timeout,
nodes=nodes,
tasks_per_node=tasks_per_node,
cpus_per_task=12,
gpus_per_node=tasks_per_node)
cpus_per_task=8,
gpus_per_node=1,
slurm_mail_type='ALL',
slurm_mail_user='ki2130@nyu.edu',
slurm_job_name='model-jepa2')
# slurm_additional_parameters={'gres': 'gpu:a100:1'})

if exclude_nodes is not None:
executor.update_parameters(slurm_exclude=exclude_nodes)
@@ -149,7 +153,7 @@ def launch_evals():
launch_evals_with_parsed_args(
args_for_evals=configs,
submitit_folder=args.folder,
partition=args.partition,
# partition=args.partition,
timeout=args.time,
nodes=nodes,
tasks_per_node=tasks_per_node,
@@ -160,3 +164,5 @@ if __name__ == '__main__':
if __name__ == '__main__':
args = parser.parse_args()
launch_evals()
print("made it!")

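A standalone sketch of the submitit launch pattern used above, with the parameter values from this diff. `run_eval` and the log folder are placeholders, not code from the PR; note that submitit's `timeout_min` takes an integer number of minutes.

```python
# Sketch only: mirrors the executor parameters in the diff above.
import submitit

def run_eval():
    # stand-in for the evaluation entry point that the real launcher submits
    return 'done'

executor = submitit.AutoExecutor(folder='logs/job_%j', slurm_max_num_timeout=20)
executor.update_parameters(
    slurm_mem='128G',
    timeout_min=48 * 60,      # 48 hours, expressed in minutes
    nodes=1,
    tasks_per_node=1,
    cpus_per_task=8,
    gpus_per_node=1,
    slurm_job_name='model-jepa2')

job = executor.submit(run_eval)
print(job.job_id)
```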
2 changes: 1 addition & 1 deletion evals/scaffold.py
@@ -19,6 +19,6 @@ def main(
resume_preempt=False
):
logger.info(f'Running evaluation: {eval_name}')
return importlib.import_module(f'evals.{eval_name}.eval').main(
return importlib.import_module(f'evals.{eval_name}.pegs_eval').main(
args_eval=args_eval,
resume_preempt=resume_preempt)
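
To illustrate what the edited scaffold line resolves to: with `eval_name: video_classification_frozen` from the config above, the dynamic import now targets the custom `pegs_eval` module instead of the stock `eval` module. A hypothetical standalone equivalent (requires the repo on `PYTHONPATH`; not part of the diff):

```python
# Illustration only: what evals/scaffold.py now imports for this PR's config.
import importlib

eval_name = 'video_classification_frozen'
module = importlib.import_module(f'evals.{eval_name}.pegs_eval')
# scaffold.main() then calls:
# module.main(args_eval=args_eval, resume_preempt=False)
```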
30 changes: 30 additions & 0 deletions evals/video_classification_frozen/kp_utils.py
@@ -0,0 +1,30 @@
import os
import matplotlib.pyplot as plt
from PIL import Image

def plot_guess_img(input_tensor, output_filename, reference_img='evals/video_classification_frozen/reference_with_border.png', scale=2000):
assert input_tensor.numel() == 165, "input tensor must have exactly 165 entries"
background = Image.open(reference_img)

plt.figure(figsize=(background.width / 100, background.height / 100), dpi=100)
plt.imshow(background)
plt.axis('off')

start_x, start_y = 520, 228
spacing = 63
scatter_x = []
scatter_y = []
sizes = []

for idx in range(input_tensor.numel()):
row = idx // 15
col = idx % 15
x = start_x + col * spacing
y = start_y + row * spacing
scatter_x.append(x)
scatter_y.append(y)
sizes.append(input_tensor[idx].item() * scale)

plt.scatter(scatter_x, scatter_y, s=sizes, c='green', alpha=0.4)
plt.savefig(os.path.join('plots/plots-300/', output_filename), bbox_inches='tight', pad_inches=0)
plt.close()
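
A hypothetical usage sketch for the new helper, assuming the hard-coded reference image and the `plots/plots-300/` output directory exist; the random tensor stands in for the sigmoid model outputs referenced in the commit history.

```python
# Hypothetical usage (not part of the diff): visualize per-peg scores on the
# reference board image. Assumes reference_with_border.png and plots/plots-300/
# are present as hard-coded in plot_guess_img.
import torch
from evals.video_classification_frozen.kp_utils import plot_guess_img

scores = torch.sigmoid(torch.randn(165))   # stand-in for sigmoid model outputs
plot_guess_img(scores, output_filename='example_guess.png')
```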