forked from tencent-ailab/IP-Adapter
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjob_train_arc.sh
More file actions
187 lines (153 loc) · 6.87 KB
/
job_train_arc.sh
File metadata and controls
187 lines (153 loc) · 6.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/bin/bash
#SBATCH -J tx+loc_proj
#SBATCH --cpus-per-task=12
#SBATCH --time=40:00:00
#SBATCH --gres=gpu:4
#SBATCH --partition=h200_normal_q
#SBATCH --account=memtrack
#SBATCH --output=/scratch/bio_diffusion/slurm_logs/%j-%x.out
########## SBATCH --qos=tc_a100_normal_short
### Rendezvous / threading environment for the distributed run.
### MASTER_PORT: change this 5-digit port as you wish; slurm will raise an
### error if it duplicates a port used by another job.
### WORLD_SIZE is meant to be gpus/node * num_nodes.
### NOTE(review): the variable actually exported below is WORLD_SIZE2 with the
### value 14, which matches neither the name nor the 4-GPU request in the
### header -- presumably stale or deliberately disabled; confirm before use.
export MASTER_PORT=15897
export WORLD_SIZE2=14

# Threads = cpus-per-task
export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK}"

### Use the first node of the allocation as the master address
### (customized for vgg slurm), e.g. master(gnodee[2-5],gnoded1) == gnodee2.
# The expansions are quoted: Slurm node lists contain '[' and ']', which the
# shell would otherwise treat as glob characters.
printf '%s\n' "NODELIST=${SLURM_NODELIST}"
printf '%s\n' "SLURM_NTASKS=${SLURM_NTASKS}"
printf '%s\n' "SLURM_PROCID=${SLURM_PROCID}"

# scontrol expands the compact node list to one hostname per line; keep the first.
master_addr=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1)
export MASTER_ADDR="${master_addr}"
printf '%s\n' "MASTER_ADDR=${MASTER_ADDR}"
# Load required modules and activate the Conda environment used for training.
module reset
module load Miniconda3/24.7.1-0

# NOTE(review): `conda init` is re-run on every job and targets the user's
# shell startup file, which is then sourced below -- presumably this is how
# conda's shell hooks get into this non-interactive shell; confirm it is
# still needed.
conda init
source ~/.bashrc
source activate bio_up

# Append the environment's lib directory to the loader search path.
# ${VAR:+...} emits the "existing value + ':'" prefix only when the variable is
# already non-empty, so an unset LD_LIBRARY_PATH does not yield a leading ':'
# (an empty entry, which the dynamic loader reads as the current directory).
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${CONDA_PREFIX}/lib/"

echo "Starting accelerate..."
## context 4
# accelerate launch --num_processes 4 --multi_gpu tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/home/mridul/IP-Adapter-fork/sdxl_models/image_encoder" \
# --data_json_file="/projects/ml4science/DATASETS/iNaturalist/train_mini_birds_arc.json" \
# --data_root_path="/projects/ml4science/DATASETS/iNaturalist/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/scratch/bio_diffusion/ip-adapter_runs/bioclip/sdxl_extra_context4" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=4
## context 1
# accelerate launch --num_processes 4 --multi_gpu --main_process_port ${MASTER_PORT} tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/home/medha/Projects/bio-diffusion/IP-Adapter-fork/sdxl_models/image_encoder" \
# --data_json_file="/projects/ml4science/DATASETS/iNaturalist/train_mini_birds_arc.json" \
# --data_root_path="/projects/ml4science/DATASETS/iNaturalist/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/scratch/bio_diffusion/ip-adapter_runs/bioclip/sdxl_extra_context1" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=1
### no projection layer
# accelerate launch --num_processes 2 --multi_gpu --main_process_port ${MASTER_PORT} tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/home/medha/Projects/bio-diffusion/IP-Adapter-fork/models/image_encoder" \
# --data_json_file="/projects/ml4science/DATASETS/iNaturalist/train_mini_birds_arc.json" \
# --data_root_path="/projects/ml4science/DATASETS/iNaturalist/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/scratch/bio_diffusion/ip-adapter_runs/bioclip/no_proj" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=1 \
# --no_projection_layer
#########################
# basic IP-Adapter debug run
# accelerate launch --num_processes 4 --multi_gpu tutorial_train.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/home/medha/Projects/bio-diffusion/IP-Adapter/models/image_encoder" \
# --data_json_file="/projects/ml4science/DATASETS/iNaturalist/train_mini_birds_arc.json" \
# --data_root_path="/projects/ml4science/DATASETS/iNaturalist/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/scratch/bio_diffusion/ip-adapter_runs/bioclip/debug_save" \
# --save_steps=2000 \
# --report_to="wandb"
##############################################
### combined bioclip + location adapter runs
# accelerate launch --num_processes 4 --multi_gpu --main_process_port ${MASTER_PORT} tutorial_train_location.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/home/medha/Projects/bio-diffusion/IP-Adapter-fork/models/image_encoder" \
# --data_json_file="/projects/ml4science/DATASETS/iNaturalist/train_mini_birds_subset_arc.json" \
# --data_root_path="/projects/ml4science/DATASETS/iNaturalist/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/scratch/bio_diffusion/ip-adapter_runs/bioclip/bioclip_location_context4" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=4 \
# --image_encoder='bioclip' \
# --image_encoder_secondary='location' \
# --image_proj_secondary
##### combined taxabind + location double projection layers adapter runs
# Options forwarded to tutorial_train_location.py. They live in an array so
# each option stays exactly one word and no backslash continuation can be
# broken by a stray trailing space or a commented-out line.
train_args=(
  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5"
  --image_encoder_path="/home/medha/Projects/bio-diffusion/IP-Adapter-fork/models/image_encoder"
  --data_json_file="/projects/ml4science/DATASETS/iNaturalist/train_mini_birds_subset_arc.json"
  --data_root_path="/projects/ml4science/DATASETS/iNaturalist/images"
  --mixed_precision="fp16"
  --resolution=512
  --train_batch_size=64
  --dataloader_num_workers=4
  --learning_rate=1e-04
  --weight_decay=0.01
  --output_dir="/scratch/bio_diffusion/ip-adapter_runs/bioclip/taxabind_location_context4_doubleproj"
  --save_steps=2000
  --report_to="wandb"
  --clip_extra_context_tokens=4
  --image_encoder='taxabind'
  --image_encoder_secondary='location'
  --image_proj_secondary
)

# --num_processes is the literal 4, matching the gpu:4 request in the header.
# MASTER_PORT is quoted so that an empty value is passed as an empty argument
# instead of letting --main_process_port consume the script name.
# This stays the last command so its exit status becomes the job's status.
accelerate launch --num_processes 4 --multi_gpu --main_process_port "${MASTER_PORT}" \
  tutorial_train_location.py "${train_args[@]}"
##### combined taxabind + location single projection layer concat embeds adapter runs
# accelerate launch --num_processes 4 --multi_gpu --main_process_port ${MASTER_PORT} tutorial_train_location.py \
# --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
# --image_encoder_path="/home/medha/Projects/bio-diffusion/IP-Adapter-fork/models/image_encoder" \
# --data_json_file="/projects/ml4science/DATASETS/iNaturalist/train_mini_birds_subset_arc.json" \
# --data_root_path="/projects/ml4science/DATASETS/iNaturalist/images" \
# --mixed_precision="fp16" \
# --resolution=512 \
# --train_batch_size=64 \
# --dataloader_num_workers=4 \
# --learning_rate=1e-04 \
# --weight_decay=0.01 \
# --output_dir="/scratch/bio_diffusion/ip-adapter_runs/bioclip/taxabind_location_context4_concat" \
# --save_steps=2000 \
# --report_to="wandb" \
# --clip_extra_context_tokens=4 \
# --image_encoder='taxabind' \
# --image_encoder_secondary='location'