From a1e0d5b8d8052a01e813b4b894a5226844f75b55 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 24 Aug 2021 12:21:33 +0800 Subject: [PATCH 1/8] for resnet regration --- Classification/cnns/of_cnn_train_val.py | 3 ++- Classification/cnns/train.sh | 6 ++++-- Classification/cnns/train_fp16.sh | 8 ++------ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py index 3d5cbbd..3258849 100755 --- a/Classification/cnns/of_cnn_train_val.py +++ b/Classification/cnns/of_cnn_train_val.py @@ -21,7 +21,8 @@ import config as configs from util import Snapshot, InitNodes, Metric from job_function_util import get_train_config, get_val_config -import resnet_model +# import resnet_model +import resnet_rename as resnet_model import resnext_model import vgg_model import alexnet_model diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 6aa2b80..5a78d34 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -19,6 +19,7 @@ echo DATA_ROOT=$DATA_ROOT LOG_FOLDER=../logs mkdir -p $LOG_FOLDER LOGFILE=$LOG_FOLDER/resnet_training.log +export PYTHONUNBUFFERED=1 python3 of_cnn_train_val.py \ --train_data_dir=$DATA_ROOT/train \ @@ -26,15 +27,16 @@ python3 of_cnn_train_val.py \ --val_data_dir=$DATA_ROOT/validation \ --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=8 \ + --gpu_num_per_node=4 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.024 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=128 \ + --batch_size_per_device=32 \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} + #--model="resnet50" 2>&1 | tee ${LOGFILE} echo "Writting log to ${LOGFILE}" diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh index 7ecfa5c..0c59ef0 100755 --- a/Classification/cnns/train_fp16.sh +++ b/Classification/cnns/train_fp16.sh @@ -26,18 +26,14 @@ export NCCL_LAUNCH_MODE=PARALLEL echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE python3 of_cnn_train_val.py \ - --train_data_dir=$DATA_ROOT/train \ - --train_data_part_num=256 \ - --val_data_dir=$DATA_ROOT/validation \ - --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=8 \ + --gpu_num_per_node=1 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.536 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=192 \ + --batch_size_per_device=64 \ --val_batch_size_per_device=50 \ --use_fp16 \ --channel_last=True \ From 0efe45261ce6a6a62eadf1d9ef59cd2f9687b05a Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 18:50:55 +0800 Subject: [PATCH 2/8] rename variable --- Classification/cnns/resnet_model.py | 83 +++++++++++++++-------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/Classification/cnns/resnet_model.py b/Classification/cnns/resnet_model.py index 7e9c1fc..784f924 100755 --- a/Classification/cnns/resnet_model.py +++ b/Classification/cnns/resnet_model.py @@ -15,6 +15,7 @@ """ import oneflow.compatible.single_client as flow +from util import build_watch_cb, build_watch_diff_cb BLOCK_COUNTS = [3, 4, 6, 3] BLOCK_FILTERS = [256, 512, 1024, 2048] @@ -50,7 +51,7 @@ def _conv2d( else: shape = (filters, input.shape[1], kernel_size, kernel_size) weight = flow.get_variable( - name + "-weight", + name + ".weight", shape=shape, dtype=input.dtype, initializer=self.weight_initializer, @@ -58,6 +59,9 @@ def _conv2d( model_name="weight", trainable=self.trainable, 
) + if 'conv1' == name: + flow.watch(weight, build_watch_cb('conv1_weight')) + flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) return flow.nn.conv2d( input, @@ -113,7 +117,7 @@ def _batch_norm_relu(self, inputs, name=None, last=False): name=name + "_bn_relu", ) else: - return flow.nn.relu(self._batch_norm(inputs, name + "_bn", last=last)) + return flow.nn.relu(self._batch_norm(inputs, name, last=last)) def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): if self.fuse_bn_add_relu: @@ -139,7 +143,7 @@ def _batch_norm_add_relu(self, inputs, addend, name=None, last=False): ) else: return flow.nn.relu( - self._batch_norm(inputs, name + "_bn", last=last) + addend + self._batch_norm(inputs, name, last=last) + addend ) def conv2d_affine(self, input, name, filters, kernel_size, strides): @@ -150,21 +154,21 @@ def conv2d_affine(self, input, name, filters, kernel_size, strides): def bottleneck_transformation( self, input, block_name, filters, filters_inner, strides ): - a = self.conv2d_affine(input, block_name + "_branch2a", filters_inner, 1, 1) - a = self._batch_norm_relu(a, block_name + "_branch2a") + a = self.conv2d_affine(input, block_name + ".conv1", filters_inner, 1, 1) + a = self._batch_norm_relu(a, block_name + ".bn1") - b = self.conv2d_affine(a, block_name + "_branch2b", filters_inner, 3, strides) - b = self._batch_norm_relu(b, block_name + "_branch2b") + b = self.conv2d_affine(a, block_name + ".conv2", filters_inner, 3, strides) + b = self._batch_norm_relu(b, block_name + ".bn2") - c = self.conv2d_affine(b, block_name + "_branch2c", filters, 1, 1) + c = self.conv2d_affine(b, block_name + ".conv3", filters, 1, 1) return c def residual_block(self, input, block_name, filters, filters_inner, strides_init): - if strides_init != 1 or block_name == "res2_0": + if strides_init != 1 or block_name == "layer1.0": shortcut = self.conv2d_affine( - input, block_name + "_branch1", filters, 1, strides_init + input, block_name + ".downsample.0", filters, 1, strides_init ) - shortcut = self._batch_norm(shortcut, block_name + "_branch1_bn") + shortcut = self._batch_norm(shortcut, block_name + ".downsample.1") else: shortcut = input @@ -172,7 +176,7 @@ def residual_block(self, input, block_name, filters, filters_inner, strides_init input, block_name, filters, filters_inner, strides_init, ) return self._batch_norm_add_relu( - bottleneck, shortcut, block_name + "_branch2c", last=True + bottleneck, shortcut, block_name + ".bn3", last=True ) def residual_stage( @@ -180,7 +184,7 @@ def residual_stage( ): output = input for i in range(counts): - block_name = "%s_%d" % (stage_name, i) + block_name = "%s.%d" % (stage_name, i) output = self.residual_block( output, block_name, filters, filters_inner, stride_init if i == 0 else 1 ) @@ -192,7 +196,7 @@ def resnet_conv_x_body(self, input): for i, (counts, filters, filters_inner) in enumerate( zip(BLOCK_COUNTS, BLOCK_FILTERS, BLOCK_FILTERS_INNER) ): - stage_name = "res%d" % (i + 2) + stage_name = "layer%d" % (i + 1) output = self.residual_stage( output, stage_name, counts, filters, filters_inner, 1 if i == 0 else 2 ) @@ -201,7 +205,7 @@ def resnet_conv_x_body(self, input): def resnet_stem(self, input): conv1 = self._conv2d("conv1", input, 64, 7, 2) - conv1_bn = self._batch_norm_relu(conv1, "conv1") + conv1_bn = self._batch_norm_relu(conv1, "bn1") pool1 = flow.nn.max_pool2d( conv1_bn, ksize=3, @@ -232,28 +236,29 @@ def resnet50(images, args, trainable=True, training=True): else: paddings = ((0, 0), (0, 1), (0, 0), (0, 0)) images = 
flow.pad(images, paddings=paddings) - with flow.scope.namespace("Resnet"): - stem = builder.resnet_stem(images) - body = builder.resnet_conv_x_body(stem) - pool5 = flow.nn.avg_pool2d( - body, - ksize=7, - strides=1, - padding="VALID", - data_format=builder.data_format, - name="pool5", - ) - fc1001 = flow.layers.dense( - flow.reshape(pool5, (pool5.shape[0], -1)), - units=1000, - use_bias=True, - kernel_initializer=flow.variance_scaling_initializer( - 2, "fan_in", "random_normal" - ), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=weight_regularizer, - bias_regularizer=weight_regularizer, - trainable=trainable, - name="fc1001", - ) + # with flow.scope.namespace("resnet50"): + stem = builder.resnet_stem(images) + body = builder.resnet_conv_x_body(stem) + pool5 = flow.nn.avg_pool2d( + body, + ksize=7, + strides=1, + padding="VALID", + data_format=builder.data_format, + name="avgpool", + ) + fc1001 = flow.layers.dense( + flow.reshape(pool5, (pool5.shape[0], -1)), + units=1000, + use_bias=True, + kernel_initializer=flow.variance_scaling_initializer( + 2, "fan_in", "random_normal" + ), + bias_initializer=flow.zeros_initializer(), + kernel_regularizer=weight_regularizer, + bias_regularizer=weight_regularizer, + trainable=trainable, + name="fc", + ) return fc1001 + From 1bf65a28019126a1a74168a1a93eb275b4d352e9 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:09:58 +0800 Subject: [PATCH 3/8] fix --- Classification/cnns/of_cnn_train_val.py | 3 +-- Classification/cnns/train_fp16.sh | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Classification/cnns/of_cnn_train_val.py b/Classification/cnns/of_cnn_train_val.py index 3258849..3d5cbbd 100755 --- a/Classification/cnns/of_cnn_train_val.py +++ b/Classification/cnns/of_cnn_train_val.py @@ -21,8 +21,7 @@ import config as configs from util import Snapshot, InitNodes, Metric from job_function_util import get_train_config, get_val_config -# import resnet_model -import resnet_rename as resnet_model +import resnet_model import resnext_model import vgg_model import alexnet_model diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh index 0c59ef0..119a0b0 100755 --- a/Classification/cnns/train_fp16.sh +++ b/Classification/cnns/train_fp16.sh @@ -27,13 +27,13 @@ echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE python3 of_cnn_train_val.py \ --num_nodes=1 \ - --gpu_num_per_node=1 \ + --gpu_num_per_node=8 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.536 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=64 \ + --batch_size_per_device=192 \ --val_batch_size_per_device=50 \ --use_fp16 \ --channel_last=True \ From e6c05485b9bc6507afb663edd54ab8fdd3305765 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:15:51 +0800 Subject: [PATCH 4/8] rm watch --- Classification/cnns/resnet_model.py | 4 ---- Classification/cnns/train.sh | 6 +++--- Classification/cnns/train_fp16.sh | 4 ++++ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Classification/cnns/resnet_model.py b/Classification/cnns/resnet_model.py index 784f924..cbb48a7 100755 --- a/Classification/cnns/resnet_model.py +++ b/Classification/cnns/resnet_model.py @@ -15,7 +15,6 @@ """ import oneflow.compatible.single_client as flow -from util import build_watch_cb, build_watch_diff_cb BLOCK_COUNTS = [3, 4, 6, 3] BLOCK_FILTERS = [256, 512, 1024, 2048] @@ -59,9 +58,6 @@ def _conv2d( model_name="weight", trainable=self.trainable, ) - if 'conv1' == name: - flow.watch(weight, 
build_watch_cb('conv1_weight')) - flow.watch_diff(weight, build_watch_diff_cb('conv1_weight_grad')) return flow.nn.conv2d( input, diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 5a78d34..4950648 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -12,7 +12,7 @@ echo NUM_EPOCH=$NUM_EPOCH if [ -n "$2" ]; then DATA_ROOT=$2 else - DATA_ROOT=/data/imagenet/ofrecord + DATA_ROOT=/dataset/ImageNet/ofrecord fi echo DATA_ROOT=$DATA_ROOT @@ -27,13 +27,13 @@ python3 of_cnn_train_val.py \ --val_data_dir=$DATA_ROOT/validation \ --val_data_part_num=256 \ --num_nodes=1 \ - --gpu_num_per_node=4 \ + --gpu_num_per_node=8 \ --optimizer="sgd" \ --momentum=0.875 \ --label_smoothing=0.1 \ --learning_rate=1.024 \ --loss_print_every_n_iter=100 \ - --batch_size_per_device=32 \ + --batch_size_per_device=128 \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} diff --git a/Classification/cnns/train_fp16.sh b/Classification/cnns/train_fp16.sh index 119a0b0..7ecfa5c 100755 --- a/Classification/cnns/train_fp16.sh +++ b/Classification/cnns/train_fp16.sh @@ -26,6 +26,10 @@ export NCCL_LAUNCH_MODE=PARALLEL echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE python3 of_cnn_train_val.py \ + --train_data_dir=$DATA_ROOT/train \ + --train_data_part_num=256 \ + --val_data_dir=$DATA_ROOT/validation \ + --val_data_part_num=256 \ --num_nodes=1 \ --gpu_num_per_node=8 \ --optimizer="sgd" \ From 12ea6901762647eaa6362a28db58782ebc3a7c2f Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:17:16 +0800 Subject: [PATCH 5/8] fix --- Classification/cnns/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 4950648..82425d4 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -12,7 +12,7 @@ echo NUM_EPOCH=$NUM_EPOCH if [ -n "$2" ]; then DATA_ROOT=$2 else - DATA_ROOT=/dataset/ImageNet/ofrecord + DATA_ROOT=/data/imagenet/ofrecord fi echo DATA_ROOT=$DATA_ROOT From fdbb3c011db20ed801e550cf1f4f39c8ec2934da Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:18:19 +0800 Subject: [PATCH 6/8] fix --- Classification/cnns/train.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 82425d4..6b66c14 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -37,6 +37,5 @@ python3 of_cnn_train_val.py \ --val_batch_size_per_device=50 \ --num_epoch=$NUM_EPOCH \ --model="resnet50" 2>&1 | tee ${LOGFILE} - #--model="resnet50" 2>&1 | tee ${LOGFILE} echo "Writting log to ${LOGFILE}" From 162f345de385da5eb2350c76bc284b500b882d27 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:19:08 +0800 Subject: [PATCH 7/8] rm lines --- Classification/cnns/train.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/Classification/cnns/train.sh b/Classification/cnns/train.sh index 6b66c14..6aa2b80 100755 --- a/Classification/cnns/train.sh +++ b/Classification/cnns/train.sh @@ -19,7 +19,6 @@ echo DATA_ROOT=$DATA_ROOT LOG_FOLDER=../logs mkdir -p $LOG_FOLDER LOGFILE=$LOG_FOLDER/resnet_training.log -export PYTHONUNBUFFERED=1 python3 of_cnn_train_val.py \ --train_data_dir=$DATA_ROOT/train \ From 5b12f09dfaa0a85378fce1627bbcb468fc3856c8 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 27 Aug 2021 19:22:44 +0800 Subject: [PATCH 8/8] fix decay batches --- Classification/cnns/optimizer_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/Classification/cnns/optimizer_util.py b/Classification/cnns/optimizer_util.py index 43cd977..146bf85 100755 --- a/Classification/cnns/optimizer_util.py +++ b/Classification/cnns/optimizer_util.py @@ -68,7 +68,7 @@ def set_up_optimizer(loss, args): batches_per_epoch = math.ceil(args.num_examples / train_batch_size) warmup_batches = batches_per_epoch * args.warmup_epochs num_train_batches = batches_per_epoch * args.num_epochs - decay_batches = num_train_batches - warmup_batches + decay_batches = num_train_batches# - warmup_batches exponential_decay_batches = batches_per_epoch * args.lr_decay_epochs # set up warmup strategy
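
Note on patch 2/8 ("rename variable"): the hunks above replace the legacy branch-style ResNet variable names with torchvision-style names (stage "res%d" % (i + 2) becomes "layer%d" % (i + 1), the "_branch*" parts become ".conv*"/".bn*"/".downsample.*", "fc1001" becomes "fc", and the "Resnet" namespace scope is dropped). A minimal Python sketch of the resulting mapping for a few representative variables, reconstructed from the hunks; the old names may additionally carry the removed "Resnet" scope prefix and OneFlow's batch-norm parameter suffixes, so treat the exact strings as illustrative rather than authoritative.

# Illustrative old -> new variable names implied by patch 2/8 ("rename variable").
# Reconstructed from the hunks above; scope prefixes and batch-norm parameter
# suffixes (gamma/beta/moving_*) that OneFlow appends are omitted here.
RENAME_EXAMPLES = {
    "conv1-weight":           "conv1.weight",                  # stem conv
    "conv1_bn":               "bn1",                           # stem batch norm
    "res2_0_branch2a-weight": "layer1.0.conv1.weight",
    "res2_0_branch2a_bn":     "layer1.0.bn1",
    "res2_0_branch2c_bn":     "layer1.0.bn3",
    "res2_0_branch1-weight":  "layer1.0.downsample.0.weight",  # projection shortcut
    "res2_0_branch1_bn":      "layer1.0.downsample.1",
    "res3_1_branch2b-weight": "layer2.1.conv2.weight",
    "pool5":                  "avgpool",                       # pooling op name
    "fc1001":                 "fc",                            # dense layer name
}

for old, new in RENAME_EXAMPLES.items():
    print(f"{old:24s} -> {new}")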
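
Note on patch 8/8 ("fix decay batches"): commenting out the warmup subtraction means the learning-rate decay window now spans the full training run rather than only the post-warmup batches. A small standalone sketch of the batch arithmetic around that hunk; the dataset size, warmup epochs and epoch count below are illustrative assumptions (only the 8 x 128 global batch comes from train.sh), not values read from the scripts.

import math

# Illustrative inputs (assumptions, not taken from the patch): ImageNet-1k-sized
# train set, 8 GPUs x 128 images per device, 5 warmup epochs, 50 epochs total.
num_examples = 1281167
train_batch_size = 8 * 128
warmup_epochs = 5
num_epochs = 50

batches_per_epoch = math.ceil(num_examples / train_batch_size)  # 1252
warmup_batches = batches_per_epoch * warmup_epochs              # 6260
num_train_batches = batches_per_epoch * num_epochs              # 62600

# Before patch 8/8: decay only over the post-warmup portion of training.
decay_batches_before = num_train_batches - warmup_batches       # 56340
# After patch 8/8: the subtraction is commented out, so decay spans the
# whole run, warmup included.
decay_batches_after = num_train_batches                         # 62600

print(decay_batches_before, decay_batches_after)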