-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathCaption_inference.sh
More file actions
33 lines (29 loc) · 904 Bytes
/
Caption_inference.sh
File metadata and controls
33 lines (29 loc) · 904 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env bash
#
# Launch caption inference (train/Caption_inference.py) through the
# `deepspeed` launcher on a single node.
#
# Usage:
#   bash Caption_inference.sh
#
# Execute this script; do not `source` it (it calls `exit` on errors and
# changes shell options).
#
# Optional environment overrides (defaults equal the previously hard-coded
# values, so running without any of them set behaves as before):
#   IMAGE_PATH           value passed as --input_path       (default: ./assets)
#   CKPT_DIR             directory that contains model.pt   (default: "...",
#                        a placeholder that must be replaced or overridden)
#   OUTPUT_DIR           value passed as --output_path      (default: ./caption.json)
#   GUIDANCE_SCALE       value passed as --guidance_scale   (default: 2.0)
#   NUM_INFERENCE_STEPS  value passed as --num_inference_steps (default: 50)
#   NUM_GPUS             value passed as --num_gpus         (default: 8)

set -o pipefail

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# `conda activate` is implemented as a shell function that is normally only
# defined in interactive shells, so load conda's shell integration explicitly
# before using it.
command -v conda >/dev/null 2>&1 || die "conda not found on PATH"
conda_base=$(conda info --base) || die "unable to locate the conda installation"
# shellcheck source=/dev/null
source "${conda_base}/etc/profile.d/conda.sh" \
  || die "unable to load ${conda_base}/etc/profile.d/conda.sh"
conda activate VLV || die "failed to activate conda environment 'VLV'"

# Strict mode is enabled only after activation.
# NOTE(review): environment activation hooks are third-party shell code that
# may not be written for `set -u`; confirm before moving this line earlier.
set -eu

IMAGE_PATH="${IMAGE_PATH:-./assets}"
CKPT_DIR="${CKPT_DIR:-...}"
# Despite the name, the default value is a JSON file path, not a directory.
OUTPUT_DIR="${OUTPUT_DIR:-./caption.json}"
GUIDANCE_SCALE="${GUIDANCE_SCALE:-2.0}"
NUM_INFERENCE_STEPS="${NUM_INFERENCE_STEPS:-50}"
NUM_GPUS="${NUM_GPUS:-8}"

# Fail fast on the placeholder / a wrong checkpoint directory instead of
# discovering it after every worker process has started.
[[ -f "${CKPT_DIR}/model.pt" ]] \
  || die "checkpoint not found: ${CKPT_DIR}/model.pt (edit CKPT_DIR or set it in the environment)"

# Remove the torch extensions cache before launching.
# NOTE(review): presumably to force a clean rebuild of JIT-compiled
# extensions -- confirm this is still required on every run.
rm -rf -- ~/.cache/torch_extensions

# Derive CUDA_HOME from the location of nvcc (<CUDA_HOME>/bin/nvcc).
# When nvcc is not on PATH, keep whatever CUDA_HOME the caller already has
# rather than exporting an empty value.
if nvcc_path=$(command -v nvcc); then
  CUDA_HOME=$(dirname -- "$(dirname -- "${nvcc_path}")")
  export CUDA_HOME
else
  printf 'warning: nvcc not found on PATH; leaving CUDA_HOME unchanged\n' >&2
fi

# Options consumed by the deepspeed launcher itself.
launcher_args=(
  --num_gpus="${NUM_GPUS}"
  --num_nodes=1
)

# Options forwarded to train/Caption_inference.py.
script_args=(
  --clip_decoder_checkpoint "${CKPT_DIR}/model.pt"
  --qwen_model ./pretrained_checkpoints/Qwen2.5-3B
  --stable_diffusion_model_path ./pretrained_checkpoints/stable-diffusion-2-1-base
  --florence2_model_path ./models/Florence2large
  --input_path "${IMAGE_PATH}"
  --output_path "${OUTPUT_DIR}"
  --guidance_scale "${GUIDANCE_SCALE}"
  --num_inference_steps "${NUM_INFERENCE_STEPS}"
  --image_size 384
  --use_text_encoder
  # NOTE(review): the deepspeed launcher normally supplies a per-process
  # local rank itself; confirm this fixed value does not override it for
  # the non-zero ranks.
  --local_rank 0
  --learnable_token_length 77
  --fp32
  --distributed
  --deepspeed
  --verbose
  --batch_size 2
)

deepspeed "${launcher_args[@]}" train/Caption_inference.py "${script_args[@]}"