diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh new file mode 100755 index 0000000..cda1d15 --- /dev/null +++ b/Classification/cnns/train_fp16.sh @@ -0,0 +1,52 @@ +rm -rf core.* +rm -rf ./output/snapshots/* + +if [ -n "$1" ]; then + NUM_EPOCH=$1 +else + NUM_EPOCH=50 +fi +echo NUM_EPOCH=$NUM_EPOCH + +# training with imagenet +if [ -n "$2" ]; then + DATA_ROOT=$2 +else + DATA_ROOT=/data/imagenet/ofrecord +fi +echo DATA_ROOT=$DATA_ROOT + +LOG_FOLDER=../logs +mkdir -p $LOG_FOLDER +LOGFILE=$LOG_FOLDER/resnet_training.log + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE + +python3 of_cnn_train_val.py \ + --train_data_dir=$DATA_ROOT/train \ + --train_data_part_num=256 \ + --val_data_dir=$DATA_ROOT/validation \ + --val_data_part_num=256 \ + --num_nodes=1 \ + --gpu_num_per_node=8 \ + --optimizer="sgd" \ + --momentum=0.875 \ + --label_smoothing=0.1 \ + --learning_rate=1.024 \ + --loss_print_every_n_iter=100 \ + --batch_size_per_device=128 \ + --val_batch_size_per_device=50 \ + --use_fp16 \ + --channel_last=True \ + --pad_output \ + --fuse_bn_relu=True \ + --fuse_bn_add_relu=True \ + --nccl_fusion_threshold_mb=16 \ + --nccl_fusion_max_ops=24 \ + --num_epoch=$NUM_EPOCH \ + --model="resnet50" 2>&1 | tee ${LOGFILE} + +echo "Writting log to ${LOGFILE}" diff --git a/Classification/cnns/train_fp32.sh b/Classification/cnns/train_fp32.sh new file mode 100755 index 0000000..ce06884 --- /dev/null +++ b/Classification/cnns/train_fp32.sh @@ -0,0 +1,50 @@ +rm -rf core.* +rm -rf ./output/snapshots/* + +if [ -n "$1" ]; then + NUM_EPOCH=$1 +else + NUM_EPOCH=50 +fi +echo NUM_EPOCH=$NUM_EPOCH + +# training with imagenet +if [ -n "$2" ]; then + DATA_ROOT=$2 +else + DATA_ROOT=/data/imagenet/ofrecord +fi +echo DATA_ROOT=$DATA_ROOT + +LOG_FOLDER=../logs +mkdir -p $LOG_FOLDER +LOGFILE=$LOG_FOLDER/resnet_training.log + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE + +python3 of_cnn_train_val.py \ + --train_data_dir=$DATA_ROOT/train \ + --train_data_part_num=256 \ + --val_data_dir=$DATA_ROOT/validation \ + --val_data_part_num=256 \ + --num_nodes=1 \ + --gpu_num_per_node=8 \ + --optimizer="sgd" \ + --momentum=0.875 \ + --label_smoothing=0.1 \ + --learning_rate=0.512 \ + --loss_print_every_n_iter=100 \ + --batch_size_per_device=64 \ + --val_batch_size_per_device=50 \ + --channel_last=False \ + --fuse_bn_relu=True \ + --fuse_bn_add_relu=True \ + --nccl_fusion_threshold_mb=16 \ + --nccl_fusion_max_ops=24 \ + --num_epoch=$NUM_EPOCH \ + --model="resnet50" 2>&1 | tee ${LOGFILE} + +echo "Writting log to ${LOGFILE}"