From a1e0d5b8d8052a01e813b4b894a5226844f75b55 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 24 Aug 2021 12:21:33 +0800 Subject: [PATCH 1/8] for resnet regration --- Classification/cnns/of_cnn_train_val.py | 3 ++- Classification/cnns/train.sh | 6 ++++-- Classification/cnns/train_fp16.sh | 8 ++------ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py index 3d5cbbd..3258849 100755 --- a/Classification/cnns/of_cnn_train_val.py +++ b/Classification/cnns/of_cnn_train_val.py @@ -21,7 +21,8 @@ import config as configs from util import Snapshot, InitNodes, Metric from job_function_util import get_train_config, get_val_config -import resnet_model +# import resnet_model +import resnet_rename as resnet_model import resnext_model import vgg_model import alexnet_model diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 6aa2b80..5a78d34 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -19,6 +19,7 @@ echo DATA_ROOT=$DATA_ROOT LOG_FOLDER=../logs mkdir -p $LOG_FOLDER LOGFILE=$LOG_FOLDER/resnet_training.log +export PYTHONUNBUFFERED=1 python3 of_cnn_train_val.py \ --train_data_dir=$DATA_ROOT/train \ @@ -26,15 +27,16 @@ python3 of_cnn_train_val.py \ --val_data_dir=$DATA_ROOT/validation \ --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=8 \ + --gpu_num_per_node=4 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.024 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=128 \ + --batch_size_per_device=32 \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} + #--model="resnet50" 2>&1 | tee ${LOGFILE} echo "Writting log to ${LOGFILE}" diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh index 7ecfa5c..0c59ef0 100755 --- a/Classification/cnns/train_fp16.sh +++ b/Classification/cnns/train_fp16.sh @@ -26,18 +26,14 @@ export NCCL_LAUNCH_MODE=PARALLEL echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE python3 of_cnn_train_val.py \ - --train_data_dir=$DATA_ROOT/train \ - --train_data_part_num=256 \ - --val_data_dir=$DATA_ROOT/validation \ - --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=8 \ + --gpu_num_per_node=1 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.536 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=192 \ + --batch_size_per_device=64 \ --val_batch_size_per_device=50 \ --use_fp16 \ --channel_last=True \ From 0efe45261ce6a6a62eadf1d9ef59cd2f9687b05a Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 18:50:55 +0800 Subject: [PATCH 2/8] rename variable --- Classification/cnns/resnet_model.py | 83 +++++++++++++++-------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/Classification/cnns/resnet_model.py b/Classification/cnns/resnet_model.py index 7e9c1fc..784f924 100755 --- a/Classification/cnns/resnet_model.py +++ b/Classification/cnns/resnet_model.py @@ -15,6 +15,7 @@ """ import oneflow.compatible.single_client as flow +from util import build_watch_cb, build_watch_diff_cb BLOCK_COUNTS = [3, 4, 6, 3] BLOCK_FILTERS = [256, 512, 1024, 2048] @@ -50,7 +51,7 @@ def _conv2d( else: shape = (filters, input.shape[1], kernel_size, kernel_size) weight = flow.get_variable( - name + "-weight", + name + ".weight", shape=shape, dtype=input.dtype, initializer=self.weight_initializer, @@ -58,6 +59,9 @@ def _conv2d( model_name="weight", trainable=self.trainable, 
) + if 'conv1' == name: + flow.watch(weight, build_watch_cb('conv1_weight')) + flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) return flow.nn.conv2d( input, @@ -113,7 +117,7 @@ def _batch_norm_relu(self, inputs, name=None, last=False): name=name + "_bn_relu", ) else: - return flow.nn.relu(self._batch_norm(inputs, name + "_bn", last=last)) + return flow.nn.relu(self._batch_norm(inputs, name, last=last)) def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): if self.fuse_bn_add_relu: @@ -139,7 +143,7 @@ def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): ) else: return flow.nn.relu( - self._batch_norm(inputs, name + "_bn", last=last) + addend + self._batch_norm(inputs, name, last=last) + addend ) def conv2d_affine(self, input, name, filters, kernel_size, strides): @@ -150,21 +154,21 @@ def conv2d_affine(self, input, name, filters, kernel_size, strides): def bottleneck_transformation( self, input, block_name, filters, filters_inner, strides ): - a = self.conv2d_affine(input, block_name + "_branch2a", filters_inner, 1, 1) - a = self._batch_norm_relu(a, block_name + "_branch2a") + a = self.conv2d_affine(input, block_name + ".conv1", filters_inner, 1, 1) + a = self._batch_norm_relu(a, block_name + ".bn1") - b = self.conv2d_affine(a, block_name + "_branch2b", filters_inner, 3, strides) - b = self._batch_norm_relu(b, block_name + "_branch2b") + b = self.conv2d_affine(a, block_name + ".conv2", filters_inner, 3, strides) + b = self._batch_norm_relu(b, block_name + ".bn2") - c = self.conv2d_affine(b, block_name + "_branch2c", filters, 1, 1) + c = self.conv2d_affine(b, block_name + ".conv3", filters, 1, 1) return c def residual_block(self, input, block_name, filters, filters_inner, strides_init): - if strides_init != 1 or block_name == "res2_0": + if strides_init != 1 or block_name == "layer1.0": shortcut = self.conv2d_affine( - input, block_name + "_branch1", filters, 1, strides_init + input, block_name + ".downsample.0", filters, 1, strides_init ) - shortcut = self._batch_norm(shortcut, block_name + "_branch1_bn") + shortcut = self._batch_norm(shortcut, block_name + ".downsample.1") else: shortcut = input @@ -172,7 +176,7 @@ def residual_block(self, input, block_name, filters, filters_inner, strides_init input, block_name, filters, filters_inner, strides_init, ) return self._batch_norm_add_relu( - bottleneck, shortcut, block_name + "_branch2c", last=True + bottleneck, shortcut, block_name + ".bn3", last=True ) def residual_stage( @@ -180,7 +184,7 @@ def residual_stage( ): output = input for i in range(counts): - block_name = "%s_%d" % (stage_name, i) + block_name = "%s.%d" % (stage_name, i) output = self.residual_block( output, block_name, filters, filters_inner, stride_init if i == 0 else 1 ) @@ -192,7 +196,7 @@ def resnet_conv_x_body(self, input): for i, (counts, filters, filters_inner) in enumerate( zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER) ): - stage_name = "res%d" % (i + 2) + stage_name = "layer%d" % (i + 1) output = self.residual_stage( output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2 ) @@ -201,7 +205,7 @@ def resnet_conv_x_body(self, input): def resnet_stem(self, input): conv1 = self._conv2d("conv1", input, 64, 7, 2) - conv1_bn = self._batch_norm_relu(conv1, "conv1") + conv1_bn = self._batch_norm_relu(conv1, "bn1") pool1 = flow.nn.max_pool2d( conv1_bn, ksize=3, @@ -232,28 +236,29 @@ def resnet50(images, args, trainable=True, training=True): else: paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) images = 
flow.pad(images, paddings=paddings) - with flow.scope.namespace("Resnet"): - stem = builder.resnet_stem(images) - body = builder.resnet_conv_x_body(stem) - pool5 = flow.nn.avg_pool2d( - body, - ksize=7, - strides=1, - padding="VALID", - data_format=builder.data_format, - name="pool5", - ) - fc1001 = flow.layers.dense( - flow.reshape(pool5, (pool5.shape[0], -1)), - units=1000, - use_bias=True, - kernel_initializer=flow.variance_scaling_initializer( - 2, "fan_in", "random_normal" - ), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=weight_regularizer, - bias_regularizer=weight_regularizer, - trainable=trainable, - name="fc1001", - ) + # with flow.scope.namespace("resnet50"): + stem = builder.resnet_stem(images) + body = builder.resnet_conv_x_body(stem) + pool5 = flow.nn.avg_pool2d( + body, + ksize=7, + strides=1, + padding="VALID", + data_format=builder.data_format, + name="avgpool", + ) + fc1001 = flow.layers.dense( + flow.reshape(pool5, (pool5.shape[0], -1)), + units=1000, + use_bias=True, + kernel_initializer=flow.variance_scaling_initializer( + 2, "fan_in", "random_normal" + ), + bias_initializer=flow.zeros_initializer(), + kernel_regularizer=weight_regularizer, + bias_regularizer=weight_regularizer, + trainable=trainable, + name="fc", + ) return fc1001 + From 1bf65a28019126a1a74168a1a93eb275b4d352e9 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:09:58 +0800 Subject: [PATCH 3/8] fix --- Classification/cnns/of_cnn_train_val.py | 3 +-- Classification/cnns/train_fp16.sh | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py index 3258849..3d5cbbd 100755 --- a/Classification/cnns/of_cnn_train_val.py +++ b/Classification/cnns/of_cnn_train_val.py @@ -21,8 +21,7 @@ import config as configs from util import Snapshot, InitNodes, Metric from job_function_util import get_train_config, get_val_config -# import resnet_model -import resnet_rename as resnet_model +import resnet_model import resnext_model import vgg_model import alexnet_model diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh index 0c59ef0..119a0b0 100755 --- a/Classification/cnns/train_fp16.sh +++ b/Classification/cnns/train_fp16.sh @@ -27,13 +27,13 @@ echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE python3 of_cnn_train_val.py \ --num_nodes=1 \ - --gpu_num_per_node=1 \ + --gpu_num_per_node=8 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.536 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=64 \ + --batch_size_per_device=192 \ --val_batch_size_per_device=50 \ --use_fp16 \ --channel_last=True \ From e6c05485b9bc6507afb663edd54ab8fdd3305765 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:15:51 +0800 Subject: [PATCH 4/8] rm watch --- Classification/cnns/resnet_model.py | 4 ---- Classification/cnns/train.sh | 6 +++--- Classification/cnns/train_fp16.sh | 4 ++++ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Classification/cnns/resnet_model.py b/Classification/cnns/resnet_model.py index 784f924..cbb48a7 100755 --- a/Classification/cnns/resnet_model.py +++ b/Classification/cnns/resnet_model.py @@ -15,7 +15,6 @@ """ import oneflow.compatible.single_client as flow -from util import build_watch_cb, build_watch_diff_cb BLOCK_COUNTS = [3, 4, 6, 3] BLOCK_FILTERS = [256, 512, 1024, 2048] @@ -59,9 +58,6 @@ def _conv2d( model_name="weight", trainable=self.trainable, ) - if 'conv1' == name: - flow.watch(weight, 
build_watch_cb('conv1_weight')) - flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) return flow.nn.conv2d( input, diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 5a78d34..4950648 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -12,7 +12,7 @@ echo NUM_EPOCH=$NUM_EPOCH if [ -n "$2" ]; then DATA_ROOT=$2 else - DATA_ROOT=/data/imagenet/ofrecord + DATA_ROOT=/dataset/ImageNet/ofrecord fi echo DATA_ROOT=$DATA_ROOT @@ -27,13 +27,13 @@ python3 of_cnn_train_val.py \ --val_data_dir=$DATA_ROOT/validation \ --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=4 \ + --gpu_num_per_node=8 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.024 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=32 \ + --batch_size_per_device=128 \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh index 119a0b0..7ecfa5c 100755 --- a/Classification/cnns/train_fp16.sh +++ b/Classification/cnns/train_fp16.sh @@ -26,6 +26,10 @@ export NCCL_LAUNCH_MODE=PARALLEL echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE python3 of_cnn_train_val.py \ + --train_data_dir=$DATA_ROOT/train \ + --train_data_part_num=256 \ + --val_data_dir=$DATA_ROOT/validation \ + --val_data_part_num=256 \ --num_nodes=1 \ --gpu_num_per_node=8 \ --optimizer="sgd" \ From 12ea6901762647eaa6362a28db58782ebc3a7c2f Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:17:16 +0800 Subject: [PATCH 5/8] fix --- Classification/cnns/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 4950648..82425d4 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -12,7 +12,7 @@ echo NUM_EPOCH=$NUM_EPOCH if [ -n "$2" ]; then DATA_ROOT=$2 else - DATA_ROOT=/dataset/ImageNet/ofrecord + DATA_ROOT=/data/imagenet/ofrecord fi echo DATA_ROOT=$DATA_ROOT From fdbb3c011db20ed801e550cf1f4f39c8ec2934da Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:18:19 +0800 Subject: [PATCH 6/8] fix --- Classification/cnns/train.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 82425d4..6b66c14 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -37,6 +37,5 @@ python3 of_cnn_train_val.py \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} - #--model="resnet50" 2>&1 | tee ${LOGFILE} echo "Writting log to ${LOGFILE}" From 162f345de385da5eb2350c76bc284b500b882d27 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:19:08 +0800 Subject: [PATCH 7/8] rm lines --- Classification/cnns/train.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 6b66c14..6aa2b80 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -19,7 +19,6 @@ echo DATA_ROOT=$DATA_ROOT LOG_FOLDER=../logs mkdir -p $LOG_FOLDER LOGFILE=$LOG_FOLDER/resnet_training.log -export PYTHONUNBUFFERED=1 python3 of_cnn_train_val.py \ --train_data_dir=$DATA_ROOT/train \ From 5b12f09dfaa0a85378fce1627bbcb468fc3856c8 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:22:44 +0800 Subject: [PATCH 8/8] fix decay batches --- Classification/cnns/optimizer_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/Classification/cnns/optimizer_util.py b/Classification/cnns/optimizer_util.py index 43cd977..146bf85 100755 --- a/Classification/cnns/optimizer_util.py +++ b/Classification/cnns/optimizer_util.py @@ -68,7 +68,7 @@ def set_up_optimizer(loss, args): batches_per_epoch = math.ceil(args.num_examples / train_batch_size) warmup_batches = batches_per_epoch * args.warmup_epochs num_train_batches = batches_per_epoch * args.num_epochs - decay_batches = num_train_batches - warmup_batches + decay_batches = num_train_batches# - warmup_batches exponential_decay_batches = batches_per_epoch * args.lr_decay_epochs # set up warmup strategy
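
Note on patch 2/8 ("rename variable"): the hunks above replace the legacy branch-style ResNet variable names with torchvision-style names (stage "res%d" % (i + 2) becomes "layer%d" % (i + 1), the "_branch*" parts become ".conv*"/".bn*"/".downsample.*", "fc1001" becomes "fc", and the "Resnet" namespace scope is dropped). A minimal Python sketch of the resulting mapping for a few representative variables, reconstructed from the hunks; the old names may additionally carry the removed "Resnet" scope prefix and OneFlow's batch-norm parameter suffixes, so treat the exact strings as illustrative rather than authoritative.

# Illustrative old -> new variable names implied by patch 2/8 ("rename variable").
# Reconstructed from the hunks above; scope prefixes and batch-norm parameter
# suffixes (gamma/beta/moving_*) that OneFlow appends are omitted here.
RENAME_EXAMPLES = {
    "conv1-weight":           "conv1.weight",                  # stem conv
    "conv1_bn":               "bn1",                           # stem batch norm
    "res2_0_branch2a-weight": "layer1.0.conv1.weight",
    "res2_0_branch2a_bn":     "layer1.0.bn1",
    "res2_0_branch2c_bn":     "layer1.0.bn3",
    "res2_0_branch1-weight":  "layer1.0.downsample.0.weight",  # projection shortcut
    "res2_0_branch1_bn":      "layer1.0.downsample.1",
    "res3_1_branch2b-weight": "layer2.1.conv2.weight",
    "pool5":                  "avgpool",                       # pooling op name
    "fc1001":                 "fc",                            # dense layer name
}

for old, new in RENAME_EXAMPLES.items():
    print(f"{old:24s} -> {new}")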
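
Note on patch 8/8 ("fix decay batches"): commenting out the warmup subtraction means the learning-rate decay window now spans the full training run rather than only the post-warmup batches. A small standalone sketch of the batch arithmetic around that hunk; the dataset size, warmup epochs and epoch count below are illustrative assumptions (only the 8 x 128 global batch comes from train.sh), not values read from the scripts.

import math

# Illustrative inputs (assumptions, not taken from the patch): ImageNet-1k-sized
# train set, 8 GPUs x 128 images per device, 5 warmup epochs, 50 epochs total.
num_examples = 1281167
train_batch_size = 8 * 128
warmup_epochs = 5
num_epochs = 50

batches_per_epoch = math.ceil(num_examples / train_batch_size)  # 1252
warmup_batches = batches_per_epoch * warmup_epochs              # 6260
num_train_batches = batches_per_epoch * num_epochs              # 62600

# Before patch 8/8: decay only over the post-warmup portion of training.
decay_batches_before = num_train_batches - warmup_batches       # 56340
# After patch 8/8: the subtraction is commented out, so decay spans the
# whole run, warmup included.
decay_batches_after = num_train_batches                         # 62600

print(decay_batches_before, decay_batches_after)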