Changes from all commits
135 commits
2631242
fix readme typo
pcintara Mar 8, 2024
d2acb84
Merge pull request #1 from pclittle/fix-readme-typo
kpinchoco Mar 8, 2024
5deb714
fix README typos evals/ and k400 yaml filename
Mar 8, 2024
d545b7b
fix README typos evals/ and k400 yaml filename
Mar 8, 2024
48e007e
fix typos
Mar 8, 2024
1b3392a
added jepa. in from jepa.evals.scaffold
Mar 11, 2024
df85d19
updated vitl16_k400 yaml and main_distributed files
Mar 28, 2024
14f22a1
edited config data paths and main_distributed parameters for child job
Mar 29, 2024
f0bbbdf
add files
Apr 4, 2024
a9f0ba2
add files modified
Apr 4, 2024
afd3a0c
test eval directory rather than csv
pcintara Apr 9, 2024
2e1df39
Merge pull request #2 from pclittle/main
kpinchoco Apr 9, 2024
e0a5ab8
changes from 4-9-24
Apr 11, 2024
164b0ac
troubleshoot pegs_eval and linear probe
Apr 21, 2024
e4a1bc8
update get_item for pendulum csv
kpinchoco Apr 21, 2024
7ba87e7
add p_mini csvs
Apr 21, 2024
1455c5c
update p_mini_train.csv
kpinchoco Apr 21, 2024
c8e4463
update p_mini_test.csv
kpinchoco Apr 21, 2024
4b96e0a
update num_classes to 165
kpinchoco Apr 21, 2024
4c57830
remove extra dimension in label_tensor
kpinchoco Apr 21, 2024
0c5a3c5
label to label_tensor in squeeze
kpinchoco Apr 21, 2024
ab63f8c
function for plotting model outputs
pcintara Apr 21, 2024
07c77f7
Merge pull request #3 from pclittle/kp_utils
kpinchoco Apr 22, 2024
46fcf59
print encoder output shape
kpinchoco Apr 22, 2024
dbce99e
update filepath to reference border png
kpinchoco Apr 22, 2024
c098819
update filepath to reference border png
kpinchoco Apr 23, 2024
fee8769
update filepath to reference border png
kpinchoco Apr 23, 2024
c7268fa
absolute filepath reference border png
kpinchoco Apr 23, 2024
a3d0db5
output_filename as parameter
kpinchoco Apr 23, 2024
878373f
update filepaths for plot imgs
kpinchoco Apr 23, 2024
1a3e266
update path from kp_utils
kpinchoco Apr 23, 2024
97743ad
update filepath to kp_utils again
kpinchoco Apr 23, 2024
202a072
change to relative filepath
kpinchoco Apr 23, 2024
da7ff26
rearrange nondefault and default arguments
kpinchoco Apr 23, 2024
3db2baf
comment out outputs png
kpinchoco Apr 23, 2024
04ce6de
add third video to p_mini_train.csv
kpinchoco Apr 23, 2024
a240bfb
print pos_embed shape x
kpinchoco Apr 23, 2024
a09aeb7
mp4 to npy
kpinchoco Apr 23, 2024
23adcbc
add print statements
kpinchoco Apr 23, 2024
b27023d
flattened_x
kpinchoco Apr 23, 2024
25c5e07
what is outputs
kpinchoco Apr 23, 2024
ff44176
convert type label tensor
kpinchoco Apr 23, 2024
cd8bb36
update loss
kpinchoco Apr 23, 2024
91f6294
update slicing of outputs
kpinchoco Apr 23, 2024
75cce82
return loss not top1_meter.avg
kpinchoco Apr 23, 2024
412bb58
fix sum_softmax and top1_acc
kpinchoco Apr 23, 2024
b4f6b6c
change loop to top1_acc
kpinchoco Apr 23, 2024
d802e3a
comment out acc
kpinchoco Apr 23, 2024
dedd29c
comment out print outputs
kpinchoco Apr 23, 2024
c1487ec
avg loss and remove % from train and test print
kpinchoco Apr 23, 2024
1ac3f39
5 videos
kpinchoco Apr 24, 2024
a11b1ba
2 one-peg videos
kpinchoco Apr 24, 2024
f39385e
comment print statements in run_one_epoch
kpinchoco Apr 24, 2024
96dd6c6
20 epochs
kpinchoco Apr 24, 2024
8493149
take out extra space before labels filepath
kpinchoco Apr 24, 2024
e3b3c18
decrease num_workers to 8 from 12
kpinchoco Apr 24, 2024
90ed933
128G from 55G slurm mem
kpinchoco Apr 24, 2024
1c621a5
gpus_per_node = 2 rather than tasks_per_node
kpinchoco Apr 24, 2024
712ff56
sum across dim=1 instead of flatten
kpinchoco Apr 24, 2024
87efceb
add print statements back
kpinchoco Apr 24, 2024
373be9c
comment out pos embed print statement
kpinchoco Apr 24, 2024
87d256c
uncomment final shape prints
kpinchoco Apr 24, 2024
ea1483e
update nn.linear
kpinchoco Apr 24, 2024
d12a038
import torch
kpinchoco Apr 24, 2024
b4cf5b4
add yaml and pegs_eval.py files - stil getting Nans for loss
Jun 18, 2024
2ae2bb7
add pegs_eval2.py
Jun 18, 2024
0a3a595
Update pegs_eval2.py
kpinchoco Jun 18, 2024
5cc933d
change criterion back to cross-entropy loss
kpinchoco Jun 18, 2024
fcd9d7a
check gradient norm after loss.backward()
kpinchoco Jun 18, 2024
4476302
add debug print statements
kpinchoco Jun 18, 2024
d4a6666
change back to pegs_eval.py
kpinchoco Jun 18, 2024
4a08333
Update vitl16_k400_16x8x3.yaml
kpinchoco Jun 27, 2024
dee1341
print min/max of encoder output
kpinchoco Jun 27, 2024
0266ff6
print min/max after sum and after linear
kpinchoco Jun 27, 2024
28096ee
comment out print label
kpinchoco Jun 27, 2024
2fdaa43
apply softmax before linear
kpinchoco Jun 27, 2024
01a3477
edit softmax syntax
kpinchoco Jun 27, 2024
45d5e0b
add avgpool(10, stride = 2)
kpinchoco Jul 1, 2024
3168f88
avgpool1d
kpinchoco Jul 1, 2024
afadbd1
fix nn.linear
kpinchoco Jul 1, 2024
d4635e8
AvgPool1d(20, stride=2)
kpinchoco Jul 1, 2024
3be2477
nn.AvgPool1d(50, stride=10)
kpinchoco Jul 1, 2024
5b16a7c
nn.AvgPool1d(50, stride=10)
kpinchoco Jul 1, 2024
19c33ea
508 to 98 in nn.linear
kpinchoco Jul 1, 2024
87f4cd5
10 epochs
kpinchoco Jul 1, 2024
900d9af
adaptivepool 1x330
kpinchoco Jul 1, 2024
241f689
comment out plot_img for now
kpinchoco Jul 1, 2024
e00493a
labels[i].unsqueeze(0)
kpinchoco Jul 1, 2024
94b1023
comment out print statements
kpinchoco Jul 30, 2024
5bb3f21
comment out print statements
kpinchoco Jul 30, 2024
9d4aed0
500 epochs
kpinchoco Jul 30, 2024
f7d1bab
1 gpu, 8:00:00 timeout, slurm_additional_options='--gres=gpu:a100:1'
kpinchoco Jul 31, 2024
795489d
timeout="8:00:00"
kpinchoco Jul 31, 2024
6e0ec23
gres='gpu:a100:1'
kpinchoco Jul 31, 2024
fb1f4d1
slurm_gres='gpu:a100:1'
kpinchoco Jul 31, 2024
eda3c70
additional_parameters={"gres": "gpu:v100:1"}
kpinchoco Jul 31, 2024
d2785d1
slurm_additional_parameters={"gres": "gpu:v100:1"}
kpinchoco Jul 31, 2024
d56f9ed
gres='gpu:v100:1' and partition='gpu'
kpinchoco Jul 31, 2024
3c97dd7
slurm_additional_parameters={"--gres": "gpu:v100:1"}
kpinchoco Jul 31, 2024
174b121
remove specify gpu type
kpinchoco Jul 31, 2024
fcc1e97
timeout="48:00:00"
kpinchoco Jul 31, 2024
0688102
partition='a100_2',
kpinchoco Jul 31, 2024
2e38d8c
slurm_additional_parameters={'gres': 'gpu:a100:1'}
kpinchoco Jul 31, 2024
e61f5d6
slurm_additional_parameters={'--gres': 'gpu:a100:1'})
kpinchoco Jul 31, 2024
95d7963
slurm_additional_parameters={'gres': 'gpu:a100:1'})
kpinchoco Jul 31, 2024
40c9f4e
slurm_additional_parameters={'gres': 'gpu:a100:1(S:0-1)'})
kpinchoco Jul 31, 2024
8c0f56d
Update main_distributed.py
kpinchoco Jul 31, 2024
63bacf9
comment out partition
kpinchoco Jul 31, 2024
8cfb1f1
sigmoid_outputs
kpinchoco Aug 1, 2024
2d71e62
save in checkpoint-models folder instead of ki2130/video_classificati…
kpinchoco Aug 1, 2024
95ca2bd
save to plots folder
kpinchoco Aug 1, 2024
2390a57
10 epochs
kpinchoco Aug 1, 2024
b8ee411
change tag to pegs-probe
kpinchoco Aug 1, 2024
cc92170
comment out gpu a100 specification for now
kpinchoco Aug 1, 2024
79df9a4
Create a sigmoid activation function object
kpinchoco Aug 1, 2024
3519897
sigmoid_outputs on tensor part not list
kpinchoco Aug 1, 2024
3204299
3 epochs
kpinchoco Aug 1, 2024
30435c5
print sigmoid outputs shape
kpinchoco Aug 1, 2024
753cf84
print sigmoid_outputs
kpinchoco Aug 1, 2024
513c9c0
input_tensor[0,idx]
kpinchoco Aug 1, 2024
97c90b5
import os
kpinchoco Aug 1, 2024
c6be822
input_tensor[idx]
kpinchoco Aug 1, 2024
464ec72
sigmoid_outputs.squeeze(0)
kpinchoco Aug 1, 2024
e2e06f6
scale=500 instead of 150
kpinchoco Aug 1, 2024
9da3f7b
scale=1000
kpinchoco Aug 1, 2024
32c6d1a
scale=2000
kpinchoco Aug 1, 2024
5de24d6
comment out min/max of encoded tensor
kpinchoco Aug 1, 2024
4c01cc8
comment out print sigmoid_outputs
kpinchoco Aug 1, 2024
02f3f66
500 epochs
kpinchoco Aug 1, 2024
bffc9a9
pegs-probe-500
kpinchoco Aug 1, 2024
ba4a3a2
epochs 300, pegs-probe-300
kpinchoco Aug 1, 2024
d542e57
add subfolder plots-300
kpinchoco Aug 1, 2024
203e17b
plots-300/
kpinchoco Aug 1, 2024
875218c
take out code you don't need from run_one_epoch
kpinchoco Aug 2, 2024
6dffc08
add code to track train/test loss
kpinchoco Aug 2, 2024
20 changes: 12 additions & 8 deletions README.md
@@ -352,32 +352,36 @@ python -m app.main_distributed \
## Launching Evaluations

### Local training
<<<<<<< HEAD
If you wish to debug your eval code or setup before launching a distributed training run, we provide the functionality to do so by running the pretraining script locally on a multi-GPU (or single-GPU) machine, however, reproducing the full eval would require launching distributed training.
=======
If you wish to debug your eval code or setup before launching a distributed training run, we provide the functionality to do so by running the evaluation script locally on a multi-GPU (or single-GPU) machine, however, reproducing the full eval would require launching distributed training.
The single-machine implementation starts from the [eval/main.py](eval/main.py), which parses the experiment config file and runs the eval locally on a multi-GPU (or single-GPU) machine.
>>>>>>> origin/main
The single-machine implementation starts from the [evals/main.py](evals/main.py), which parses the experiment config file and runs the eval locally on a multi-GPU (or single-GPU) machine.

For example, to run ImageNet image classification on GPUs "0", "1", and "2" on a local machine using the config [configs/eval/vitl16_in1k.yaml](configs/eval/vitl16_in1k.yaml), type the command:
For example, to run ImageNet image classification on GPUs "0", "1", and "2" on a local machine using the config [configs/evals/vitl16_in1k.yaml](configs/evals/vitl16_in1k.yaml), type the command:
```bash
python -m evals.main \
--fname configs/eval/vitl16_in1k.yaml \
--fname configs/evals/vitl16_in1k.yaml \
--devices cuda:0 cuda:1 cuda:2
```


### Distributed training
To launch a distributed evaluation run, the implementation starts from [eval/main_distributed.py](eval/main_distributed.py), which, in addition to parsing the config file, also allows for specifying details about distributed training. For distributed training, we use the popular open-source [submitit](https://github.com/facebookincubator/submitit) tool and provide examples for a SLURM cluster.
To launch a distributed evaluation run, the implementation starts from [evals/main_distributed.py](evals/main_distributed.py), which, in addition to parsing the config file, also allows for specifying details about distributed training. For distributed training, we use the popular open-source [submitit](https://github.com/facebookincubator/submitit) tool and provide examples for a SLURM cluster.

For example, to launch a distributed ImageNet image classification experiment using the config [configs/eval/vitl16_in1k.yaml](configs/eval/vitl16_in1k.yaml), type the command:
For example, to launch a distributed ImageNet image classification experiment using the config [configs/evals/vitl16_in1k.yaml](configs/evals/vitl16_in1k.yaml), type the command:
```bash
python -m evals.main_distributed \
--fname configs/eval/vitl16_in1k.yaml \
--fname configs/evals/vitl16_in1k.yaml \
--folder $path_to_save_stderr_and_stdout \
--partition $slurm_partition
```

Similarly, to launch a distributed K400 video classification experiment using the config [configs/eval/vitl16_k400.yaml](configs/eval/vitl16_k400.yaml), type the command:
Similarly, to launch a distributed K400 video classification experiment using the config [configs/evals/vitl16_k400_16x8x3.yaml](configs/evals/vitl16_k400_16x8x3.yaml), type the command:
```bash
python -m evals.main_distributed \
--fname configs/eval/vitl16_k400.yaml \
--fname configs/evals/vitl16_k400_16x8x3.yaml \
--folder $path_to_save_stderr_and_stdout \
--partition $slurm_partition
```
19 changes: 10 additions & 9 deletions configs/evals/vitl16_k400_16x8x3.yaml
@@ -1,20 +1,21 @@
nodes: 8
tasks_per_node: 8
tag: k400-16x8x3

nodes: 1
tasks_per_node: 1
tag: pegs-probe-300
eval_name: video_classification_frozen
resume_checkpoint: false
data:
dataset_train: /your_path_to_kinetics400_train_csv_file_index.csv
dataset_val: /your_path_to_kinetics400_val_csv_file_index.csv
dataset_train: /scratch/ki2130/my-jepa/jepa/p_dummy.csv
dataset_val: /scratch/ki2130/my-jepa/jepa/p_dummy.csv
dataset_type: VideoDataset
num_classes: 400
num_classes: 165
frames_per_clip: 16
num_segments: 8
num_views_per_segment: 3
frame_step: 4
optimization:
attend_across_segments: true
num_epochs: 20
num_epochs: 300
resolution: 224
batch_size: 4
weight_decay: 0.01
@@ -34,6 +35,6 @@ pretrain:
tight_silu: false
use_sdpa: true
patch_size: 16
folder: /your_absolute_file_path_to_directory_where_pretrained_models_are_contained/
checkpoint: jepa-latest.pth.tar # name of pretrained model file inside folder
folder: /scratch/ki2130/ #/your_absolute_file_path_to_directory_where_pretrained_models_are_contained/
checkpoint: vitl16.pth.tar #jepa-latest.pth.tar # name of pretrained model file inside folder
write_tag: jepa
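
For reference, a minimal sketch of how an `evals/main.py`-style entry point might read this config before handing it to the scaffold. The filename and key names mirror the diff above; everything else (PyYAML usage, the prints) is illustrative and not part of the PR.

```python
# Minimal sketch, assuming PyYAML is installed and the config path from the
# diff exists locally; nothing here is code from the actual PR.
import yaml

with open('configs/evals/vitl16_k400_16x8x3.yaml') as f:
    cfg = yaml.safe_load(f)

# Values changed in this PR:
print(cfg['eval_name'])                    # video_classification_frozen
print(cfg['data']['num_classes'])          # 165
print(cfg['optimization']['num_epochs'])   # 300
print(cfg['pretrain']['checkpoint'])       # vitl16.pth.tar
```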
20 changes: 13 additions & 7 deletions evals/main_distributed.py
@@ -73,8 +73,8 @@ def checkpoint(self):
def launch_evals_with_parsed_args(
args_for_evals,
submitit_folder,
partition='learnlab,learnfair',
timeout=4300,
# partition='a100_2',
timeout="48:00:00",
nodes=1,
tasks_per_node=1,
delay_seconds=10,
@@ -90,13 +90,17 @@ def launch_evals_with_parsed_args(
folder=os.path.join(submitit_folder, 'job_%j'),
slurm_max_num_timeout=20)
executor.update_parameters(
slurm_partition=partition,
slurm_mem_per_gpu='55G',
# slurm_partition=partition,
slurm_mem='128G',
timeout_min=timeout,
nodes=nodes,
tasks_per_node=tasks_per_node,
cpus_per_task=12,
gpus_per_node=tasks_per_node)
cpus_per_task=8,
gpus_per_node=1,
slurm_mail_type='ALL',
slurm_mail_user='ki2130@nyu.edu',
slurm_job_name='model-jepa2')
# slurm_additional_parameters={'gres': 'gpu:a100:1'})

if exclude_nodes is not None:
executor.update_parameters(slurm_exclude=exclude_nodes)
@@ -149,7 +153,7 @@ def launch_evals():
launch_evals_with_parsed_args(
args_for_evals=configs,
submitit_folder=args.folder,
partition=args.partition,
# partition=args.partition,
timeout=args.time,
nodes=nodes,
tasks_per_node=tasks_per_node,
@@ -160,3 +164,5 @@ if __name__ == '__main__':
if __name__ == '__main__':
args = parser.parse_args()
launch_evals()
print("made it!")

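A standalone sketch of the submitit launch pattern used above, with the parameter values from this diff. `run_eval` and the log folder are placeholders, not code from the PR; note that submitit's `timeout_min` takes an integer number of minutes.

```python
# Sketch only: mirrors the executor parameters in the diff above.
import submitit

def run_eval():
    # stand-in for the evaluation entry point that the real launcher submits
    return 'done'

executor = submitit.AutoExecutor(folder='logs/job_%j', slurm_max_num_timeout=20)
executor.update_parameters(
    slurm_mem='128G',
    timeout_min=48 * 60,      # 48 hours, expressed in minutes
    nodes=1,
    tasks_per_node=1,
    cpus_per_task=8,
    gpus_per_node=1,
    slurm_job_name='model-jepa2')

job = executor.submit(run_eval)
print(job.job_id)
```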
2 changes: 1 addition & 1 deletion evals/scaffold.py
@@ -19,6 +19,6 @@ def main(
resume_preempt=False
):
logger.info(f'Running evaluation: {eval_name}')
return importlib.import_module(f'evals.{eval_name}.eval').main(
return importlib.import_module(f'evals.{eval_name}.pegs_eval').main(
args_eval=args_eval,
resume_preempt=resume_preempt)
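
To illustrate what the edited scaffold line resolves to: with `eval_name: video_classification_frozen` from the config above, the dynamic import now targets the custom `pegs_eval` module instead of the stock `eval` module. A hypothetical standalone equivalent (requires the repo on `PYTHONPATH`; not part of the diff):

```python
# Illustration only: what evals/scaffold.py now imports for this PR's config.
import importlib

eval_name = 'video_classification_frozen'
module = importlib.import_module(f'evals.{eval_name}.pegs_eval')
# scaffold.main() then calls:
# module.main(args_eval=args_eval, resume_preempt=False)
```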
30 changes: 30 additions & 0 deletions evals/video_classification_frozen/kp_utils.py
@@ -0,0 +1,30 @@
import os
import matplotlib.pyplot as plt
from PIL import Image

def plot_guess_img(input_tensor, output_filename, reference_img='evals/video_classification_frozen/reference_with_border.png', scale=2000):
assert input_tensor.numel() == 165, "input tensor must have exactly 165 entries"
background = Image.open(reference_img)

plt.figure(figsize=(background.width / 100, background.height / 100), dpi=100)
plt.imshow(background)
plt.axis('off')

start_x, start_y = 520, 228
spacing = 63
scatter_x = []
scatter_y = []
sizes = []

for idx in range(input_tensor.numel()):
row = idx // 15
col = idx % 15
x = start_x + col * spacing
y = start_y + row * spacing
scatter_x.append(x)
scatter_y.append(y)
sizes.append(input_tensor[idx].item() * scale)

plt.scatter(scatter_x, scatter_y, s=sizes, c='green', alpha=0.4)
plt.savefig(os.path.join('plots/plots-300/', output_filename), bbox_inches='tight', pad_inches=0)
plt.close()
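
A hypothetical usage sketch for the new helper, assuming the hard-coded reference image and the `plots/plots-300/` output directory exist; the random tensor stands in for the sigmoid model outputs referenced in the commit history.

```python
# Hypothetical usage (not part of the diff): visualize per-peg scores on the
# reference board image. Assumes reference_with_border.png and plots/plots-300/
# are present as hard-coded in plot_guess_img.
import torch
from evals.video_classification_frozen.kp_utils import plot_guess_img

scores = torch.sigmoid(torch.randn(165))   # stand-in for sigmoid model outputs
plot_guess_img(scores, output_filename='example_guess.png')
```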