From 0ae720c75879508962267b5f97e0beab19ca6d72 Mon Sep 17 00:00:00 2001 From: gongel Date: Thu, 13 Apr 2023 07:14:31 +0000 Subject: [PATCH 1/4] add dropout --- test_dropout/test_dropout_incubate.py | 360 +++++++++++++++++ test_dropout/test_nn_Dropout_develop.py | 377 +++++++++++++++++ .../test_nn_functional_dropout_develop.py | 380 ++++++++++++++++++ 3 files changed, 1117 insertions(+) create mode 100644 test_dropout/test_dropout_incubate.py create mode 100644 test_dropout/test_nn_Dropout_develop.py create mode 100644 test_dropout/test_nn_functional_dropout_develop.py diff --git a/test_dropout/test_dropout_incubate.py b/test_dropout/test_dropout_incubate.py new file mode 100644 index 0000000..cea762d --- /dev/null +++ b/test_dropout/test_dropout_incubate.py @@ -0,0 +1,360 @@ +import numpy as np +import paddle +import torch +import unittest +from paddle.fluid.layers.utils import map_structure +import sys +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core + +seed = 1234 + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +class TestMatmulIncubateCase1_FP32(unittest.TestCase): + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.p = float(np_inputs_array["p"]) + self.np_dout = np_inputs_array["dout"] + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else 
"float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) + out_grads = paddle.grad( + [out], [x], grad_outputs=[dout_t], retain_graph=True + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) + out_grads = paddle.static.gradients( + [out], [x], target_gradients=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + # get develop eager res + develop_res_array = np.load(self.save_eager_res_path) + out_eager_develop = develop_res_array["out_eager"] + out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grads_develop = [out_eager_grad_0_develop] + + # calculate incubate eager res + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # compare incubate eager res with develop eager res + np.testing.assert_equal( + out_eager_np, + out_eager_develop, + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_equal( + out_grads_eager_np[idx], + out_eager_grads_develop[idx], + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) + % self.dtype, + ) + + def test_static_accuracy(self): + # get develop static res + develop_res_array = np.load(self.save_static_res_path) + out_static_develop = develop_res_array["out_static"] + out_grads_static_0_develop = develop_res_array["out_grads_static_0"] + out_grads_static_develop = [out_grads_static_0_develop] + + # calculate incubate static res + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, 
out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # compare incubate static res with develop static res + np.testing.assert_equal( + out_static, + out_static_develop, + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_develop[idx], + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) + % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res(x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) + % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) + % self.dtype, + ) + + +class 
TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + +class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + +class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + +class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + +class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + unittest.main() diff --git a/test_dropout/test_nn_Dropout_develop.py b/test_dropout/test_nn_Dropout_develop.py new file mode 100644 index 0000000..ddba168 --- /dev/null +++ b/test_dropout/test_nn_Dropout_develop.py @@ -0,0 +1,377 @@ +import numpy as np +import paddle +import torch +import unittest +import sys +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure + +seed = 1234 +np.random.seed(seed) +paddle.seed(seed) +torch.manual_seed(seed) +if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = 
TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.Dropout(p=self.p)(x_t) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.Dropout(p=self.p)(x_t) + out_grads = paddle.grad( + [out], [x], grad_outputs=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.Dropout(p=self.p)(x_t) + out_grads = paddle.static.gradients( + [out], [x], target_gradients=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_matmul_incubate + np.savez(self.save_eager_res_path, out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + 
self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.Dropout eager grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_matmul_incubate + np.savez(self.save_static_res_path, out_static=out_static, + out_grads_static_0=out_grads_static[0]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.Dropout static forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.Dropout static grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg=( + 'Develop: paddle.nn.Dropout eager forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg=( + 'Develop: paddle.nn.Dropout eager grad is unstable in %s dtype' + ) + % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) 
+ out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg=( + 'Develop: paddle.nn.Dropout static forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg=( + 'Develop: paddle.nn.Dropout static grad is unstable in %s dtype' + ) + % self.dtype, + ) + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + generate_np_inputs_and_dout() + unittest.main() diff --git a/test_dropout/test_nn_functional_dropout_develop.py b/test_dropout/test_nn_functional_dropout_develop.py new file mode 100644 index 0000000..52243ab --- /dev/null +++ b/test_dropout/test_nn_functional_dropout_develop.py @@ -0,0 +1,380 @@ +import numpy as np +import paddle +import torch +import unittest +import sys +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure + +seed = 1234 + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del 
x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.functional.dropout(x_t, p=self.p) + out_grads = paddle.grad( + [out], [x], grad_outputs=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.functional.dropout(x_t, p=self.p) + out_grads = paddle.static.gradients( + [out], [x], target_gradients=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + 
out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_matmul_incubate + np.savez(self.save_eager_res_path, out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_matmul_incubate + np.savez(self.save_static_res_path, out_static=out_static, + out_grads_static_0=out_grads_static[0]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg=( + 'Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg=( + 'Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' + ) + % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), 
paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg=( + 'Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg=( + 'Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' + ) + % self.dtype, + ) + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + generate_np_inputs_and_dout() + unittest.main() From a1438d3dfffea71d3aeebb81fac2009ab0e7adf6 Mon Sep 17 00:00:00 2001 From: gongel Date: Mon, 17 Apr 2023 01:26:23 +0000 Subject: [PATCH 2/4] add FusedDropoutAdd --- .../test_FusedDropoutAdd_dropout_incubate.py | 412 +++++++++++++++++ ...est_incubate_nn_FusedDropoutAdd_develop.py | 423 ++++++++++++++++++ 2 files changed, 835 insertions(+) create mode 100644 test_dropout/test_FusedDropoutAdd_dropout_incubate.py create mode 100644 test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py diff --git a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py new file mode 100644 index 0000000..6eb424b --- /dev/null +++ b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py @@ -0,0 +1,412 @@ +import numpy as np +import paddle +import torch +import unittest +from paddle.fluid.layers.utils import map_structure 
+import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core + +seed = 1234 + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +class TestMatmulIncubateCase1_FP32(unittest.TestCase): + + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, y_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res( + x_torch, y_torch, dout_torch) + del x_torch + del y_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_y = np_inputs_array["y"] + self.p = float(np_inputs_array["p"]) + self.np_dout = np_inputs_array["dout"] + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_y = self.np_y.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + y_torch = torch.tensor( + self.np_y, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + return x_torch, y_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + y_eager = paddle.to_tensor( + self.np_y, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + y_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_torch, y_torch, dout_torch + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + y_static = paddle.static.data( + 'y', + shape=self.np_y.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + y_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, y_static, dout_static + + 
def cal_torch_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + y_t = y.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + y_t + out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + y_t + out_grads = paddle.grad([out], [x, y], + grad_outputs=[dout_t], + retain_graph=True) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + y_t + out_grads = paddle.static.gradients([out], [x, y], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + # get develop eager res + develop_res_array = np.load(self.save_eager_res_path) + out_eager_develop = develop_res_array["out_eager"] + out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grads_develop = [ + out_eager_grad_0_develop, out_eager_grad_1_develop + ] + + # calculate incubate eager res + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + del x_eager + del y_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # compare incubate eager res with develop eager res + np.testing.assert_equal( + out_eager_np, + out_eager_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_equal( + out_grads_eager_np[idx], + out_eager_grads_develop[idx], + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + # get develop static res + develop_res_array = np.load(self.save_static_res_path) + out_static_develop = develop_res_array["out_static"] + out_grads_static_0_develop = develop_res_array["out_grads_static_0"] + out_grads_static_develop = [ + out_grads_static_0_develop, out_grads_static_1_develop + ] + + # calculate incubate static res + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = 
self.gen_static_inputs_and_dout( + ) + (out_static, out_grads_static) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # compare incubate static res with develop static res + np.testing.assert_equal( + out_static, + out_static_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_develop[idx], + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Incubate: 
paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + unittest.main() diff --git a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py new file mode 100644 index 0000000..28ed383 --- /dev/null +++ b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py @@ -0,0 +1,423 @@ +import numpy as np +import paddle +import torch +import unittest +import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure + +seed = 1234 +np.random.seed(seed) +paddle.seed(seed) +torch.manual_seed(seed) +if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + y_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + y_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, y=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, y=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + + def setUp(self): + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, y_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res( + x_torch, y_torch, dout_torch) + del x_torch + del y_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), 
+ out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_y = np_inputs_array["y"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_y = self.np_y.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + y_torch = torch.tensor( + self.np_y, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + return x_torch, y_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + y_eager = paddle.to_tensor( + self.np_y, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + y_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, y_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + y_static = paddle.static.data( + 'y', + shape=self.np_y.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + y_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, y_static, dout_static + + def cal_torch_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + y_t = y.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + y_t + out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) + out_grads = paddle.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = 
paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + del x_eager + del y_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_matmul_incubate + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0], + out_grads_eager_1=out_grads_eager_np[1]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static, out_grads_static) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_matmul_incubate + np.savez(self.save_static_res_path, + out_static=out_static, + out_grads_static_0=out_grads_static[0], + out_grads_static_1=out_grads_static[1]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() 
+ + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + 
generate_np_inputs_and_dout() + unittest.main() From f637d4d3f1a6027c98a1d5a5e0a82d0eaae5f497 Mon Sep 17 00:00:00 2001 From: gongel Date: Tue, 18 Apr 2023 07:32:30 +0000 Subject: [PATCH 3/4] fix --- test_dropout/test_FusedDropoutAdd_dropout_incubate.py | 5 +++-- test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py index 6eb424b..06a94f5 100644 --- a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py +++ b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py @@ -37,7 +37,7 @@ def setUp(self): del dout_torch self.out_torch = out_torch.cpu().detach().numpy() self.out_grads_torch = map_structure( - lambda x: x.cpu().numpy(), + lambda x: x.detach().cpu().numpy(), out_grads_torch, ) del out_torch, out_grads_torch @@ -109,7 +109,7 @@ def gen_eager_inputs_and_dout(self): place="gpu", ) dout_eager.stop_gradient = False - return x_torch, y_torch, dout_torch + return x_eager, y_eager, dout_eager def gen_static_inputs_and_dout(self): x_static = paddle.static.data( @@ -187,6 +187,7 @@ def test_eager_accuracy(self): develop_res_array = np.load(self.save_eager_res_path) out_eager_develop = develop_res_array["out_eager"] out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grad_1_develop = develop_res_array["out_grads_eager_1"] out_eager_grads_develop = [ out_eager_grad_0_develop, out_eager_grad_1_develop ] diff --git a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py index 28ed383..73990d5 100644 --- a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py +++ b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py @@ -48,7 +48,7 @@ def setUp(self): del dout_torch self.out_torch = out_torch.cpu().detach().numpy() self.out_grads_torch = map_structure( - lambda x: x.cpu().numpy(), + lambda x: x.detach().cpu().numpy(), out_grads_torch, ) del out_torch, out_grads_torch From fd5411e741c9029d8415eeb9380205e5ca233aab Mon Sep 17 00:00:00 2001 From: gongel Date: Thu, 20 Apr 2023 08:47:32 +0000 Subject: [PATCH 4/4] update seed --- .../test_FusedDropoutAdd_dropout_incubate.py | 65 +----- test_dropout/test_dropout_incubate.py | 196 ++++++++---------- ...est_incubate_nn_FusedDropoutAdd_develop.py | 33 ++- test_dropout/test_nn_Dropout_develop.py | 146 +++++++------ .../test_nn_functional_dropout_develop.py | 130 ++++++------ 5 files changed, 271 insertions(+), 299 deletions(-) diff --git a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py index 06a94f5..f96b53f 100644 --- a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py +++ b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py @@ -8,6 +8,7 @@ sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core +from paddle.fluid.framework import in_dygraph_mode seed = 1234 @@ -16,6 +17,8 @@ def set_seed(): np.random.seed(seed) paddle.seed(seed) torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) if core.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': True}) torch.backends.cudnn.deterministic = True @@ -29,19 +32,6 @@ def setUp(self): self.init_params() self.init_threshold() self.init_np_inputs_and_dout() - x_torch, y_torch, dout_torch = self.gen_torch_inputs_and_dout() - out_torch, out_grads_torch = 
self.cal_torch_res( - x_torch, y_torch, dout_torch) - del x_torch - del y_torch - del dout_torch - self.out_torch = out_torch.cpu().detach().numpy() - self.out_grads_torch = map_structure( - lambda x: x.detach().cpu().numpy(), - out_grads_torch, - ) - del out_torch, out_grads_torch - torch.cuda.empty_cache() def init_params(self): self.np_input_dir = "./inputs_case1.npz" @@ -66,30 +56,6 @@ def init_np_inputs_and_dout(self): self.np_y = self.np_y.astype("float16") self.np_dout = self.np_dout.astype("float16") - def gen_torch_inputs_and_dout(self): - x_torch = torch.tensor( - self.np_x, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' else torch.float32, - requires_grad=True, - ) - y_torch = torch.tensor( - self.np_y, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' else torch.float32, - requires_grad=True, - ) - dout_torch = torch.tensor( - self.np_dout, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' else torch.float32, - requires_grad=True, - ) - return x_torch, y_torch, dout_torch - def gen_eager_inputs_and_dout(self): x_eager = paddle.to_tensor( self.np_x, @@ -132,21 +98,6 @@ def gen_static_inputs_and_dout(self): dout_static.stop_gradient = False return x_static, y_static, dout_static - def cal_torch_res(self, x, y, dout): - x_t = x - y_t = y - dout_t = dout - if self.dtype == "bfloat16": - x_t = x.to(dtype=torch.bfloat16) - y_t = y.to(dtype=torch.bfloat16) - dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) - out = torch.nn.functional.dropout(x_t, p=self.p) + y_t - out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) - if self.dtype == "bfloat16": - out = out.to(dtype=torch.float32) - return out, out_grads - def cal_eager_res(self, x, y, dout): x_t = x y_t = y @@ -155,7 +106,7 @@ def cal_eager_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( x_t, p=self.p) + y_t out_grads = paddle.grad([out], [x, y], @@ -173,7 +124,7 @@ def cal_static_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( x_t, p=self.p) + y_t out_grads = paddle.static.gradients([out], [x, y], @@ -194,6 +145,7 @@ def test_eager_accuracy(self): # calculate incubate eager res x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) del x_eager @@ -246,6 +198,7 @@ def test_static_accuracy(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, @@ -277,6 +230,7 @@ def test_static_accuracy(self): def test_eager_stability(self): x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -289,6 +243,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager = out_eager.numpy() @@ -325,6 +280,7 @@ def 
test_static_stability(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, @@ -337,6 +293,7 @@ def test_static_stability(self): ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, feed={ diff --git a/test_dropout/test_dropout_incubate.py b/test_dropout/test_dropout_incubate.py index cea762d..afd6b2d 100644 --- a/test_dropout/test_dropout_incubate.py +++ b/test_dropout/test_dropout_incubate.py @@ -4,16 +4,21 @@ import unittest from paddle.fluid.layers.utils import map_structure import sys + sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core +from paddle.fluid.framework import in_dygraph_mode seed = 1234 + def set_seed(): np.random.seed(seed) paddle.seed(seed) torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) if core.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': True}) torch.backends.cudnn.deterministic = True @@ -21,29 +26,19 @@ def set_seed(): class TestMatmulIncubateCase1_FP32(unittest.TestCase): + def setUp(self): set_seed() self.init_params() self.init_threshold() self.init_np_inputs_and_dout() - x_torch, dout_torch = self.gen_torch_inputs_and_dout() - out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) - del x_torch - del dout_torch - self.out_torch = out_torch.cpu().detach().numpy() - self.out_grads_torch = map_structure( - lambda x: x.cpu().numpy(), - out_grads_torch, - ) - del out_torch, out_grads_torch - torch.cuda.empty_cache() def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float32" self.save_static_res_path = "./static_develop_res_case1_fp32.npz" self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" - + def init_threshold(self): self.atol = TOLERANCE[self.dtype]["atol"] self.rtol = TOLERANCE[self.dtype]["rtol"] @@ -58,26 +53,7 @@ def init_np_inputs_and_dout(self): if self.dtype == "float16": self.np_x = self.np_x.astype("float16") self.np_dout = self.np_dout.astype("float16") - - def gen_torch_inputs_and_dout(self): - x_torch = torch.tensor( - self.np_x, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, - requires_grad=True, - ) - dout_torch = torch.tensor( - self.np_dout, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, - requires_grad=True, - ) - return x_torch, dout_torch - + def gen_eager_inputs_and_dout(self): x_eager = paddle.to_tensor( self.np_x, @@ -108,30 +84,18 @@ def gen_static_inputs_and_dout(self): dout_static.stop_gradient = False return x_static, dout_static - def cal_torch_res(self, x, dout): - x_t = x - dout_t = dout - if self.dtype == "bfloat16": - x_t = x.to(dtype=torch.bfloat16) - dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) - out = torch.nn.functional.dropout(x_t, p=self.p) - out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) - if self.dtype == "bfloat16": - out = out.to(dtype=torch.float32) - return out, out_grads - def cal_eager_res(self, x, dout): x_t = x dout_t = dout if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) - out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) - out_grads = paddle.grad( - [out], [x], grad_outputs=[dout_t], 
retain_graph=True - ) + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + out_grads = paddle.grad([out], [x], + grad_outputs=[dout_t], + retain_graph=True) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -142,11 +106,11 @@ def cal_static_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) - out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) - out_grads = paddle.static.gradients( - [out], [x], target_gradients=[dout_t] - ) + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -160,15 +124,16 @@ def test_eager_accuracy(self): # calculate incubate eager res x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) del x_eager del dout_eager paddle.device.cuda.empty_cache() out_eager_np = out_eager.numpy() out_grads_eager_np = map_structure( - lambda x: x.numpy(), - out_grads_eager, - ) + lambda x: x.numpy(), + out_grads_eager, + ) del out_eager del out_grads_eager paddle.device.cuda.empty_cache() @@ -176,21 +141,19 @@ def test_eager_accuracy(self): np.testing.assert_equal( out_eager_np, out_eager_develop, - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager_np)): np.testing.assert_equal( out_grads_eager_np[idx], out_eager_grads_develop[idx], - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) % self.dtype, ) - % self.dtype, - ) - + def test_static_accuracy(self): # get develop static res develop_res_array = np.load(self.save_static_res_path) @@ -207,71 +170,73 @@ def test_static_accuracy(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static] + out_grads_static, ) out_static, out_grads_static = out[0], out[1:] - + # compare incubate static res with develop static res np.testing.assert_equal( out_static, out_static_develop, - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) % 
self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_develop[idx], - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) % self.dtype, ) def test_eager_stability(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() - out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res(x_eager, dout_eager) + set_seed() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() out_grads_eager_baseline_np = map_structure( - lambda x: x.numpy(), - out_grads_eager_baseline, - ) + lambda x: x.numpy(), + out_grads_eager_baseline, + ) del out_eager_baseline del out_grads_eager_baseline paddle.device.cuda.empty_cache() for i in range(50): - out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) out_eager = out_eager.numpy() out_grads_eager = map_structure( - lambda x: x.numpy(), - out_grads_eager, - ) + lambda x: x.numpy(), + out_grads_eager, + ) np.testing.assert_equal( out_eager, out_eager_baseline_np, - err_msg=( - 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager)): np.testing.assert_equal( out_grads_eager[idx], out_grads_eager_baseline_np[idx], - err_msg=( - 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) % self.dtype, ) def test_static_stability(self): @@ -284,71 +249,84 @@ def test_static_stability(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static, out_grads_static = out[0], out[1:] np.testing.assert_equal( out_static, out_static_baseline, - err_msg=( - 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_baseline[idx], - err_msg=( - 'Incubate: 
paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) % self.dtype, ) class TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float16" self.save_static_res_path = "./static_develop_res_case1_fp16.npz" self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "bfloat16" self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float32" self.save_static_res_path = "./static_develop_res_case2_fp32.npz" self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float16" self.save_static_res_path = "./static_develop_res_case2_fp16.npz" self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "bfloat16" diff --git a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py index 73990d5..c30048d 100644 --- a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py +++ b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py @@ -8,15 +8,21 @@ from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode seed = 1234 -np.random.seed(seed) -paddle.seed(seed) -torch.manual_seed(seed) -if core.is_compiled_with_cuda(): - paddle.set_flags({'FLAGS_cudnn_deterministic': True}) - torch.backends.cudnn.deterministic = True - torch.cuda.manual_seed_all(seed) + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) def generate_np_inputs_and_dout(): @@ -151,7 +157,7 @@ def cal_torch_res(self, x, y, dout): x_t = x.to(dtype=torch.bfloat16) y_t = y.to(dtype=torch.bfloat16) dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) + set_seed() out = torch.nn.functional.dropout(x_t, p=self.p) + y_t out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -166,7 +172,7 @@ def cal_eager_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) out_grads = paddle.grad([out], [x, y], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -181,7 +187,7 @@ def cal_static_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = 
paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) out_grads = paddle.static.gradients([out], [x], target_gradients=[dout_t]) @@ -191,6 +197,7 @@ def cal_static_res(self, x, y, dout): def test_eager_accuracy(self): x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) del x_eager @@ -244,6 +251,7 @@ def test_static_accuracy(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, @@ -285,6 +293,7 @@ def test_static_accuracy(self): def test_eager_stability(self): x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -297,6 +306,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager = out_eager.numpy() @@ -333,7 +343,7 @@ def test_static_stability(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) - + set_seed() exe.run(sp) out = exe.run( mp, @@ -346,6 +356,7 @@ def test_static_stability(self): ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, feed={ diff --git a/test_dropout/test_nn_Dropout_develop.py b/test_dropout/test_nn_Dropout_develop.py index ddba168..7c169a8 100644 --- a/test_dropout/test_nn_Dropout_develop.py +++ b/test_dropout/test_nn_Dropout_develop.py @@ -3,19 +3,27 @@ import torch import unittest import sys + sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode seed = 1234 -np.random.seed(seed) -paddle.seed(seed) -torch.manual_seed(seed) -if core.is_compiled_with_cuda(): - paddle.set_flags({'FLAGS_cudnn_deterministic': True}) - torch.backends.cudnn.deterministic = True - torch.cuda.manual_seed_all(seed) + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + def generate_np_inputs_and_dout(): p = 0.1 @@ -31,6 +39,7 @@ def generate_np_inputs_and_dout(): class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): self.init_params() self.init_threshold() @@ -73,16 +82,14 @@ def gen_torch_inputs_and_dout(self): self.np_x, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) dout_torch = torch.tensor( self.np_dout, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) return x_torch, dout_torch @@ -123,7 +130,7 @@ def cal_torch_res(self, x, dout): if self.dtype == "bfloat16": x_t = x.to(dtype=torch.bfloat16) dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) + set_seed() out = torch.nn.Dropout(p=self.p)(x_t) out_grads = 
torch.autograd.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -136,11 +143,9 @@ def cal_eager_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.Dropout(p=self.p)(x_t) - out_grads = paddle.grad( - [out], [x], grad_outputs=[dout_t] - ) + out_grads = paddle.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -151,17 +156,17 @@ def cal_static_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.Dropout(p=self.p)(x_t) - out_grads = paddle.static.gradients( - [out], [x], target_gradients=[dout_t] - ) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads def test_eager_accuracy(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) del x_eager del dout_eager @@ -175,7 +180,8 @@ def test_eager_accuracy(self): del out_grads_eager paddle.device.cuda.empty_cache() # save eager res for test_matmul_incubate - np.savez(self.save_eager_res_path, out_eager=out_eager_np, + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, out_grads_eager_0=out_grads_eager_np[0]) # compare eager res with torch @@ -184,10 +190,9 @@ def test_eager_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager_np)): np.testing.assert_allclose( @@ -195,11 +200,11 @@ def test_eager_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.Dropout eager grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.Dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, ) + def test_static_accuracy(self): with paddle.fluid.framework._dygraph_guard(None): mp, sp = paddle.static.Program(), paddle.static.Program() @@ -209,19 +214,22 @@ def test_static_accuracy(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static] + out_grads_static, ) out_static, out_grads_static = out[0], out[1:] # save static res for test_matmul_incubate - np.savez(self.save_static_res_path, out_static=out_static, + np.savez(self.save_static_res_path, + out_static=out_static, out_grads_static_0=out_grads_static[0]) # compare static res with torch @@ -230,10 +238,9 @@ def test_static_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.Dropout static forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.Dropout static forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_allclose( @@ -241,14 
+248,14 @@ def test_static_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.Dropout static grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.Dropout static grad res with torch failed in %s dtype' + ) % self.dtype, ) - + def test_eager_stability(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -261,6 +268,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, dout_eager) out_eager = out_eager.numpy() @@ -271,24 +279,21 @@ def test_eager_stability(self): np.testing.assert_equal( out_eager, out_eager_baseline_np, - err_msg=( - 'Develop: paddle.nn.Dropout eager forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout eager forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager)): np.testing.assert_equal( out_grads_eager[idx], out_grads_eager_baseline_np[idx], - err_msg=( - 'Develop: paddle.nn.Dropout eager grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout eager grad is unstable in %s dtype' + ) % self.dtype, ) def test_static_stability(self): with paddle.fluid.framework._dygraph_guard(None): - paddle.framework.random._manual_program_seed(seed) mp, sp = paddle.static.Program(), paddle.static.Program() with paddle.static.program_guard(mp, sp): x_static, dout_static = self.gen_static_inputs_and_dout() @@ -296,43 +301,48 @@ def test_static_stability(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) - + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static, out_grads_static = out[0], out[1:] np.testing.assert_equal( out_static, out_static_baseline, - err_msg=( - 'Develop: paddle.nn.Dropout static forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout static forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_baseline[idx], - err_msg=( - 'Develop: paddle.nn.Dropout static grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout static grad is unstable in %s dtype' + ) % self.dtype, ) + class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float16" @@ -341,6 +351,7 @@ def init_params(self): class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "bfloat16" @@ -349,6 +360,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = 
"float32" @@ -357,6 +369,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float16" @@ -365,6 +378,7 @@ def init_params(self): class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "bfloat16" diff --git a/test_dropout/test_nn_functional_dropout_develop.py b/test_dropout/test_nn_functional_dropout_develop.py index 52243ab..58f53a1 100644 --- a/test_dropout/test_nn_functional_dropout_develop.py +++ b/test_dropout/test_nn_functional_dropout_develop.py @@ -3,22 +3,28 @@ import torch import unittest import sys + sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode seed = 1234 + def set_seed(): np.random.seed(seed) paddle.seed(seed) torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) if core.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': True}) torch.backends.cudnn.deterministic = True torch.cuda.manual_seed_all(seed) + def generate_np_inputs_and_dout(): p = 0.1 @@ -33,8 +39,8 @@ def generate_np_inputs_and_dout(): class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): - set_seed() self.init_params() self.init_threshold() self.init_np_inputs_and_dout() @@ -76,16 +82,14 @@ def gen_torch_inputs_and_dout(self): self.np_x, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) dout_torch = torch.tensor( self.np_dout, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) return x_torch, dout_torch @@ -126,7 +130,7 @@ def cal_torch_res(self, x, dout): if self.dtype == "bfloat16": x_t = x.to(dtype=torch.bfloat16) dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) + set_seed() out = torch.nn.functional.dropout(x_t, p=self.p) out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -139,11 +143,9 @@ def cal_eager_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.functional.dropout(x_t, p=self.p) - out_grads = paddle.grad( - [out], [x], grad_outputs=[dout_t] - ) + out_grads = paddle.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -154,17 +156,17 @@ def cal_static_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.functional.dropout(x_t, p=self.p) - out_grads = paddle.static.gradients( - [out], [x], target_gradients=[dout_t] - ) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads def test_eager_accuracy(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) del x_eager del dout_eager @@ -178,7 +180,8 @@ def 
test_eager_accuracy(self): del out_grads_eager paddle.device.cuda.empty_cache() # save eager res for test_matmul_incubate - np.savez(self.save_eager_res_path, out_eager=out_eager_np, + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, out_grads_eager_0=out_grads_eager_np[0]) # compare eager res with torch @@ -187,10 +190,9 @@ def test_eager_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager_np)): np.testing.assert_allclose( @@ -198,11 +200,11 @@ def test_eager_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, ) + def test_static_accuracy(self): with paddle.fluid.framework._dygraph_guard(None): mp, sp = paddle.static.Program(), paddle.static.Program() @@ -212,19 +214,22 @@ def test_static_accuracy(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static] + out_grads_static, ) out_static, out_grads_static = out[0], out[1:] # save static res for test_matmul_incubate - np.savez(self.save_static_res_path, out_static=out_static, + np.savez(self.save_static_res_path, + out_static=out_static, out_grads_static_0=out_grads_static[0]) # compare static res with torch @@ -233,10 +238,9 @@ def test_static_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_allclose( @@ -244,14 +248,14 @@ def test_static_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' + ) % self.dtype, ) - + def test_eager_stability(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -264,6 +268,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, dout_eager) out_eager = out_eager.numpy() @@ -274,19 +279,17 @@ def test_eager_stability(self): np.testing.assert_equal( out_eager, out_eager_baseline_np, - err_msg=( - 'Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager)): 
np.testing.assert_equal( out_grads_eager[idx], out_grads_eager_baseline_np[idx], - err_msg=( - 'Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' + ) % self.dtype, ) def test_static_stability(self): @@ -299,43 +302,48 @@ def test_static_stability(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) - + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static, out_grads_static = out[0], out[1:] np.testing.assert_equal( out_static, out_static_baseline, - err_msg=( - 'Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_baseline[idx], - err_msg=( - 'Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' + ) % self.dtype, ) + class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float16" @@ -344,6 +352,7 @@ def init_params(self): class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "bfloat16" @@ -352,6 +361,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float32" @@ -360,6 +370,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float16" @@ -368,6 +379,7 @@ def init_params(self): class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "bfloat16"
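Note: each test file above does sys.path.append("..") and imports TOLERANCE and convert_dtype_to_torch_type from a sibling utils module that is not included in this patch series. A minimal sketch of such a helper is given below, assuming one atol/rtol pair per dtype string; the tolerance values are illustrative placeholders, not the suite's real thresholds.

import torch

# Hypothetical utils.py: per-dtype comparison thresholds consumed as
# TOLERANCE[dtype]["atol"] / TOLERANCE[dtype]["rtol"] by np.testing.assert_allclose.
TOLERANCE = {
    "float32": {"atol": 1e-6, "rtol": 1e-6},
    "float16": {"atol": 1e-3, "rtol": 1e-3},
    "bfloat16": {"atol": 1e-2, "rtol": 1e-2},
}

def convert_dtype_to_torch_type(dtype):
    # Map the dtype strings used by the tests onto the corresponding torch dtypes.
    return {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }[dtype]

Run order also matters: the *_develop tests generate inputs_case1.npz / inputs_case2.npz and save the eager and static baselines (eager_develop_res_*.npz, static_develop_res_*.npz), while the *_incubate tests only load those files, so the develop suite must be executed before the incubate suite.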