forked from tencent-ailab/IP-Adapter
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjob_train_osc.sh
More file actions
128 lines (108 loc) · 4.58 KB
/
job_train_osc.sh
File metadata and controls
128 lines (108 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/bin/bash
# Slurm batch script: sets up the distributed-training environment variables,
# activates the conda environment and launches tutorial_train.py through
# `accelerate launch`.
#
# Resources requested from Slurm by the directives below:
#   --job-name         name shown for the job in the queue
#   --account          project account the job is submitted under
#   --time             wall-clock limit, format days-hours:minutes:seconds (1 day)
#   --nodes            number of nodes (1)
#   --ntasks-per-node  tasks per node (16)
#                      NOTE(review): presumably sized to reserve CPU cores for
#                      the dataloader workers -- confirm.
#   --gres=gpu:2       two GPUs per node; must match --num_processes passed to
#                      `accelerate launch` further down
#SBATCH --job-name=ipadapter_bioclip
#SBATCH --account=PAS2136
#SBATCH --time=1-00:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --gres=gpu:2
### Rendezvous settings exported for the training processes.
### MASTER_PORT: any free 5-digit port; pick another one if the launch fails
###              because the port is already used by someone else on the node.
### WORLD_SIZE:  total number of processes = GPUs per node * number of nodes.
###              The #SBATCH header requests 1 node with 2 GPUs, hence 2.
export MASTER_PORT=12355
export WORLD_SIZE=2

### Record the allocation in the job log. The expansions are quoted because
### Slurm node lists such as gnodee[2-5] contain glob characters.
echo "NODELIST=${SLURM_NODELIST}"
echo "SLURM_NTASKS=${SLURM_NTASKS}"
echo "SLURM_PROCID=${SLURM_PROCID}"

### Use the first host of the allocation as the master address,
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR="$master_addr"
echo "MASTER_ADDR=${MASTER_ADDR}"

### Non-fatal on purpose: only make an empty address visible in the job log.
if [[ -z "$MASTER_ADDR" ]]; then
  echo "WARNING: could not determine MASTER_ADDR from SLURM_JOB_NODELIST" >&2
fi
# Explicit CUDA module handling is currently disabled; the commands are kept
# here for reference.
# module spider cuda
# module load cuda/12.3.0

# Make conda available in the job shell, print the known environments to the
# job log, then switch to the taxa_bind environment before launching.
module load miniconda3
conda info --envs
conda activate taxa_bind

printf '%s\n' "Starting accelerate..."
### clip_extra_context_tokens=4 (disabled)
# accelerate launch --num_processes 2 --multi_gpu tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/users/PAS2136/mridul/scratchpad/taxabind/IP-Adapter/models/image_encoder" \
# --data_json_file="/fs/ess/PAS2136/bio_diffusion/data/inat/images/train_mini_birds.json" \
# --data_root_path="/fs/ess/PAS2136/bio_diffusion/data/inat/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=8 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/fs/ess/PAS2136/bio_diffusion/ip-adapter_runs/bioclip/extra_context4_2gpus" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=4
### clip_extra_context_tokens=1 (disabled)
# accelerate launch --num_processes 2 --multi_gpu tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/users/PAS2136/mridul/scratchpad/taxabind/IP-Adapter/models/image_encoder" \
# --data_json_file="/fs/ess/PAS2136/bio_diffusion/data/inat/images/train_mini_birds.json" \
# --data_root_path="/fs/ess/PAS2136/bio_diffusion/data/inat/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/fs/ess/PAS2136/bio_diffusion/ip-adapter_runs/bioclip/extra_context1_2gpus" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=1
### no projection layer
# accelerate launch --num_processes 2 --multi_gpu tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/users/PAS2136/mridul/scratchpad/taxabind/IP-Adapter/models/image_encoder" \
# --data_json_file="/fs/ess/PAS2136/bio_diffusion/data/inat/images/train_mini_birds.json" \
# --data_root_path="/fs/ess/PAS2136/bio_diffusion/data/inat/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/fs/ess/PAS2136/bio_diffusion/ip-adapter_runs/bioclip/no_projection_layer" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=1 \
# --no_projection_layer
### location encoder
# accelerate launch --num_processes 2 --multi_gpu tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/users/PAS2136/mridul/scratchpad/taxabind/IP-Adapter/models/image_encoder" \
# --data_json_file="/fs/ess/PAS2136/bio_diffusion/data/inat/images/train_mini_birds_subset.json" \
# --data_root_path="/fs/ess/PAS2136/bio_diffusion/data/inat/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=8 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/fs/ess/PAS2136/bio_diffusion/ip-adapter_runs/bioclip/location_2gpus" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=4 \
# --image_encoder='location'
### taxabind encoder
# Options consumed by the accelerate launcher itself: two processes in
# multi-GPU mode.
launcher_opts=(--num_processes 2 --multi_gpu)

# Options forwarded to tutorial_train.py. Keeping one option per array element
# avoids backslash line continuations.
trainer_opts=(
  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5"
  --image_encoder_path="/users/PAS2136/mridul/scratchpad/taxabind/IP-Adapter/models/image_encoder"
  --data_json_file="/fs/ess/PAS2136/bio_diffusion/data/inat/images/train_mini_birds_subset.json"
  --data_root_path="/fs/ess/PAS2136/bio_diffusion/data/inat/images"
  --mixed_precision="fp16"
  --resolution=512
  --train_batch_size=64
  --dataloader_num_workers=8
  --learning_rate=1e-04
  --weight_decay=0.01
  --output_dir="/fs/ess/PAS2136/bio_diffusion/ip-adapter_runs/bioclip/taxabind_2gpus"
  --save_steps=2000
  --report_to="wandb"
  --clip_extra_context_tokens=4
  --image_encoder='taxabind'
)

accelerate launch "${launcher_opts[@]}" tutorial_train.py "${trainer_opts[@]}"