From 0ae720c75879508962267b5f97e0beab19ca6d72 Mon Sep 17 00:00:00 2001 From: gongel Date: Thu, 13 Apr 2023 07:14:31 +0000 Subject: [PATCH 1/4] add dropout --- test_dropout/test_dropout_incubate.py | 360 +++++++++++++++++ test_dropout/test_nn_Dropout_develop.py | 377 +++++++++++++++++ .../test_nn_functional_dropout_develop.py | 380 ++++++++++++++++++ 3 files changed, 1117 insertions(+) create mode 100644 test_dropout/test_dropout_incubate.py create mode 100644 test_dropout/test_nn_Dropout_develop.py create mode 100644 test_dropout/test_nn_functional_dropout_develop.py diff --git a/test_dropout/test_dropout_incubate.py b/test_dropout/test_dropout_incubate.py new file mode 100644 index 0000000..cea762d --- /dev/null +++ b/test_dropout/test_dropout_incubate.py @@ -0,0 +1,360 @@ +import numpy as np +import paddle +import torch +import unittest +from paddle.fluid.layers.utils import map_structure +import sys +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core + +seed = 1234 + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +class TestMatmulIncubateCase1_FP32(unittest.TestCase): + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.p = float(np_inputs_array["p"]) + self.np_dout = np_inputs_array["dout"] + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else 
"float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) + out_grads = paddle.grad( + [out], [x], grad_outputs=[dout_t], retain_graph=True + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) + out_grads = paddle.static.gradients( + [out], [x], target_gradients=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + # get develop eager res + develop_res_array = np.load(self.save_eager_res_path) + out_eager_develop = develop_res_array["out_eager"] + out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grads_develop = [out_eager_grad_0_develop] + + # calculate incubate eager res + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # compare incubate eager res with develop eager res + np.testing.assert_equal( + out_eager_np, + out_eager_develop, + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_equal( + out_grads_eager_np[idx], + out_eager_grads_develop[idx], + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) + % self.dtype, + ) + + def test_static_accuracy(self): + # get develop static res + develop_res_array = np.load(self.save_static_res_path) + out_static_develop = develop_res_array["out_static"] + out_grads_static_0_develop = develop_res_array["out_grads_static_0"] + out_grads_static_develop = [out_grads_static_0_develop] + + # calculate incubate static res + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, 
out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # compare incubate static res with develop static res + np.testing.assert_equal( + out_static, + out_static_develop, + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_develop[idx], + err_msg=( + 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) + % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res(x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) + % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg=( + 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) + % self.dtype, + ) + + +class 
TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + +class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + +class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + +class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + +class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + unittest.main() diff --git a/test_dropout/test_nn_Dropout_develop.py b/test_dropout/test_nn_Dropout_develop.py new file mode 100644 index 0000000..ddba168 --- /dev/null +++ b/test_dropout/test_nn_Dropout_develop.py @@ -0,0 +1,377 @@ +import numpy as np +import paddle +import torch +import unittest +import sys +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure + +seed = 1234 +np.random.seed(seed) +paddle.seed(seed) +torch.manual_seed(seed) +if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = 
TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.Dropout(p=self.p)(x_t) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.Dropout(p=self.p)(x_t) + out_grads = paddle.grad( + [out], [x], grad_outputs=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.Dropout(p=self.p)(x_t) + out_grads = paddle.static.gradients( + [out], [x], target_gradients=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_matmul_incubate + np.savez(self.save_eager_res_path, out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + 
self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.Dropout eager grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_matmul_incubate + np.savez(self.save_static_res_path, out_static=out_static, + out_grads_static_0=out_grads_static[0]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.Dropout static forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.Dropout static grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg=( + 'Develop: paddle.nn.Dropout eager forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg=( + 'Develop: paddle.nn.Dropout eager grad is unstable in %s dtype' + ) + % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) 
+ out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg=( + 'Develop: paddle.nn.Dropout static forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg=( + 'Develop: paddle.nn.Dropout static grad is unstable in %s dtype' + ) + % self.dtype, + ) + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + generate_np_inputs_and_dout() + unittest.main() diff --git a/test_dropout/test_nn_functional_dropout_develop.py b/test_dropout/test_nn_functional_dropout_develop.py new file mode 100644 index 0000000..52243ab --- /dev/null +++ b/test_dropout/test_nn_functional_dropout_develop.py @@ -0,0 +1,380 @@ +import numpy as np +import paddle +import torch +import unittest +import sys +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure + +seed = 1234 + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del 
x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' + else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.functional.dropout(x_t, p=self.p) + out_grads = paddle.grad( + [out], [x], grad_outputs=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.nn.functional.dropout(x_t, p=self.p) + out_grads = paddle.static.gradients( + [out], [x], target_gradients=[dout_t] + ) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + 
out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_matmul_incubate + np.savez(self.save_eager_res_path, out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_matmul_incubate + np.savez(self.save_static_res_path, out_static=out_static, + out_grads_static_0=out_grads_static[0]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg=( + 'Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' + ) + % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg=( + 'Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg=( + 'Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' + ) + % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), 
paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor( + place=paddle.CUDAPlace(0) + ) + + exe.run(sp) + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={"x": self.np_x, "dout": self.np_dout}, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg=( + 'Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' + ) + % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg=( + 'Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' + ) + % self.dtype, + ) + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + generate_np_inputs_and_dout() + unittest.main() From a1438d3dfffea71d3aeebb81fac2009ab0e7adf6 Mon Sep 17 00:00:00 2001 From: gongel Date: Mon, 17 Apr 2023 01:26:23 +0000 Subject: [PATCH 2/4] add FusedDropoutAdd --- .../test_FusedDropoutAdd_dropout_incubate.py | 412 +++++++++++++++++ ...est_incubate_nn_FusedDropoutAdd_develop.py | 423 ++++++++++++++++++ 2 files changed, 835 insertions(+) create mode 100644 test_dropout/test_FusedDropoutAdd_dropout_incubate.py create mode 100644 test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py diff --git a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py new file mode 100644 index 0000000..6eb424b --- /dev/null +++ b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py @@ -0,0 +1,412 @@ +import numpy as np +import paddle +import torch +import unittest +from paddle.fluid.layers.utils import map_structure 
+import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core + +seed = 1234 + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +class TestMatmulIncubateCase1_FP32(unittest.TestCase): + + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, y_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res( + x_torch, y_torch, dout_torch) + del x_torch + del y_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_y = np_inputs_array["y"] + self.p = float(np_inputs_array["p"]) + self.np_dout = np_inputs_array["dout"] + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_y = self.np_y.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + y_torch = torch.tensor( + self.np_y, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + return x_torch, y_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + y_eager = paddle.to_tensor( + self.np_y, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + y_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_torch, y_torch, dout_torch + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + y_static = paddle.static.data( + 'y', + shape=self.np_y.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + y_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, y_static, dout_static + + 
def cal_torch_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + y_t = y.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + y_t + out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + y_t + out_grads = paddle.grad([out], [x, y], + grad_outputs=[dout_t], + retain_graph=True) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + y_t + out_grads = paddle.static.gradients([out], [x, y], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + # get develop eager res + develop_res_array = np.load(self.save_eager_res_path) + out_eager_develop = develop_res_array["out_eager"] + out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grads_develop = [ + out_eager_grad_0_develop, out_eager_grad_1_develop + ] + + # calculate incubate eager res + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + del x_eager + del y_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # compare incubate eager res with develop eager res + np.testing.assert_equal( + out_eager_np, + out_eager_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_equal( + out_grads_eager_np[idx], + out_eager_grads_develop[idx], + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + # get develop static res + develop_res_array = np.load(self.save_static_res_path) + out_static_develop = develop_res_array["out_static"] + out_grads_static_0_develop = develop_res_array["out_grads_static_0"] + out_grads_static_develop = [ + out_grads_static_0_develop, out_grads_static_1_develop + ] + + # calculate incubate static res + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = 
self.gen_static_inputs_and_dout( + ) + (out_static, out_grads_static) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # compare incubate static res with develop static res + np.testing.assert_equal( + out_static, + out_static_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_develop[idx], + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Incubate: 
paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + unittest.main() diff --git a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py new file mode 100644 index 0000000..28ed383 --- /dev/null +++ b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py @@ -0,0 +1,423 @@ +import numpy as np +import paddle +import torch +import unittest +import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure + +seed = 1234 +np.random.seed(seed) +paddle.seed(seed) +torch.manual_seed(seed) +if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + y_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + y_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, y=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, y=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + + def setUp(self): + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, y_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res( + x_torch, y_torch, dout_torch) + del x_torch + del y_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), 
+ out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_y = np_inputs_array["y"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_y = self.np_y.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + y_torch = torch.tensor( + self.np_y, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + return x_torch, y_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + y_eager = paddle.to_tensor( + self.np_y, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + y_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, y_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + y_static = paddle.static.data( + 'y', + shape=self.np_y.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + y_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, y_static, dout_static + + def cal_torch_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + y_t = y.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + torch.manual_seed(seed) + out = torch.nn.functional.dropout(x_t, p=self.p) + y_t + out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) + out_grads = paddle.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = 
paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + paddle.seed(seed) + out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + del x_eager + del y_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_matmul_incubate + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0], + out_grads_eager_1=out_grads_eager_np[1]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static, out_grads_static) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_matmul_incubate + np.savez(self.save_static_res_path, + out_static=out_static, + out_grads_static_0=out_grads_static[0], + out_grads_static_1=out_grads_static[1]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() 
+ + for i in range(50): + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + 
generate_np_inputs_and_dout() + unittest.main() From f637d4d3f1a6027c98a1d5a5e0a82d0eaae5f497 Mon Sep 17 00:00:00 2001 From: gongel Date: Tue, 18 Apr 2023 07:32:30 +0000 Subject: [PATCH 3/4] fix --- test_dropout/test_FusedDropoutAdd_dropout_incubate.py | 5 +++-- test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py index 6eb424b..06a94f5 100644 --- a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py +++ b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py @@ -37,7 +37,7 @@ def setUp(self): del dout_torch self.out_torch = out_torch.cpu().detach().numpy() self.out_grads_torch = map_structure( - lambda x: x.cpu().numpy(), + lambda x: x.detach().cpu().numpy(), out_grads_torch, ) del out_torch, out_grads_torch @@ -109,7 +109,7 @@ def gen_eager_inputs_and_dout(self): place="gpu", ) dout_eager.stop_gradient = False - return x_torch, y_torch, dout_torch + return x_eager, y_eager, dout_eager def gen_static_inputs_and_dout(self): x_static = paddle.static.data( @@ -187,6 +187,7 @@ def test_eager_accuracy(self): develop_res_array = np.load(self.save_eager_res_path) out_eager_develop = develop_res_array["out_eager"] out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grad_1_develop = develop_res_array["out_grads_eager_1"] out_eager_grads_develop = [ out_eager_grad_0_develop, out_eager_grad_1_develop ] diff --git a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py index 28ed383..73990d5 100644 --- a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py +++ b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py @@ -48,7 +48,7 @@ def setUp(self): del dout_torch self.out_torch = out_torch.cpu().detach().numpy() self.out_grads_torch = map_structure( - lambda x: x.cpu().numpy(), + lambda x: x.detach().cpu().numpy(), out_grads_torch, ) del out_torch, out_grads_torch From fd5411e741c9029d8415eeb9380205e5ca233aab Mon Sep 17 00:00:00 2001 From: gongel Date: Thu, 20 Apr 2023 08:47:32 +0000 Subject: [PATCH 4/4] update seed --- .../test_FusedDropoutAdd_dropout_incubate.py | 65 +----- test_dropout/test_dropout_incubate.py | 196 ++++++++---------- ...est_incubate_nn_FusedDropoutAdd_develop.py | 33 ++- test_dropout/test_nn_Dropout_develop.py | 146 +++++++------ .../test_nn_functional_dropout_develop.py | 130 ++++++------ 5 files changed, 271 insertions(+), 299 deletions(-) diff --git a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py index 06a94f5..f96b53f 100644 --- a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py +++ b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py @@ -8,6 +8,7 @@ sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core +from paddle.fluid.framework import in_dygraph_mode seed = 1234 @@ -16,6 +17,8 @@ def set_seed(): np.random.seed(seed) paddle.seed(seed) torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) if core.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': True}) torch.backends.cudnn.deterministic = True @@ -29,19 +32,6 @@ def setUp(self): self.init_params() self.init_threshold() self.init_np_inputs_and_dout() - x_torch, y_torch, dout_torch = self.gen_torch_inputs_and_dout() - out_torch, out_grads_torch = 
self.cal_torch_res( - x_torch, y_torch, dout_torch) - del x_torch - del y_torch - del dout_torch - self.out_torch = out_torch.cpu().detach().numpy() - self.out_grads_torch = map_structure( - lambda x: x.detach().cpu().numpy(), - out_grads_torch, - ) - del out_torch, out_grads_torch - torch.cuda.empty_cache() def init_params(self): self.np_input_dir = "./inputs_case1.npz" @@ -66,30 +56,6 @@ def init_np_inputs_and_dout(self): self.np_y = self.np_y.astype("float16") self.np_dout = self.np_dout.astype("float16") - def gen_torch_inputs_and_dout(self): - x_torch = torch.tensor( - self.np_x, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' else torch.float32, - requires_grad=True, - ) - y_torch = torch.tensor( - self.np_y, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' else torch.float32, - requires_grad=True, - ) - dout_torch = torch.tensor( - self.np_dout, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' else torch.float32, - requires_grad=True, - ) - return x_torch, y_torch, dout_torch - def gen_eager_inputs_and_dout(self): x_eager = paddle.to_tensor( self.np_x, @@ -132,21 +98,6 @@ def gen_static_inputs_and_dout(self): dout_static.stop_gradient = False return x_static, y_static, dout_static - def cal_torch_res(self, x, y, dout): - x_t = x - y_t = y - dout_t = dout - if self.dtype == "bfloat16": - x_t = x.to(dtype=torch.bfloat16) - y_t = y.to(dtype=torch.bfloat16) - dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) - out = torch.nn.functional.dropout(x_t, p=self.p) + y_t - out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) - if self.dtype == "bfloat16": - out = out.to(dtype=torch.float32) - return out, out_grads - def cal_eager_res(self, x, y, dout): x_t = x y_t = y @@ -155,7 +106,7 @@ def cal_eager_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( x_t, p=self.p) + y_t out_grads = paddle.grad([out], [x, y], @@ -173,7 +124,7 @@ def cal_static_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( x_t, p=self.p) + y_t out_grads = paddle.static.gradients([out], [x, y], @@ -194,6 +145,7 @@ def test_eager_accuracy(self): # calculate incubate eager res x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) del x_eager @@ -246,6 +198,7 @@ def test_static_accuracy(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, @@ -277,6 +230,7 @@ def test_static_accuracy(self): def test_eager_stability(self): x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -289,6 +243,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager = out_eager.numpy() @@ -325,6 +280,7 @@ def 
test_static_stability(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, @@ -337,6 +293,7 @@ def test_static_stability(self): ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, feed={ diff --git a/test_dropout/test_dropout_incubate.py b/test_dropout/test_dropout_incubate.py index cea762d..afd6b2d 100644 --- a/test_dropout/test_dropout_incubate.py +++ b/test_dropout/test_dropout_incubate.py @@ -4,16 +4,21 @@ import unittest from paddle.fluid.layers.utils import map_structure import sys + sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core +from paddle.fluid.framework import in_dygraph_mode seed = 1234 + def set_seed(): np.random.seed(seed) paddle.seed(seed) torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) if core.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': True}) torch.backends.cudnn.deterministic = True @@ -21,29 +26,19 @@ def set_seed(): class TestMatmulIncubateCase1_FP32(unittest.TestCase): + def setUp(self): set_seed() self.init_params() self.init_threshold() self.init_np_inputs_and_dout() - x_torch, dout_torch = self.gen_torch_inputs_and_dout() - out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) - del x_torch - del dout_torch - self.out_torch = out_torch.cpu().detach().numpy() - self.out_grads_torch = map_structure( - lambda x: x.cpu().numpy(), - out_grads_torch, - ) - del out_torch, out_grads_torch - torch.cuda.empty_cache() def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float32" self.save_static_res_path = "./static_develop_res_case1_fp32.npz" self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" - + def init_threshold(self): self.atol = TOLERANCE[self.dtype]["atol"] self.rtol = TOLERANCE[self.dtype]["rtol"] @@ -58,26 +53,7 @@ def init_np_inputs_and_dout(self): if self.dtype == "float16": self.np_x = self.np_x.astype("float16") self.np_dout = self.np_dout.astype("float16") - - def gen_torch_inputs_and_dout(self): - x_torch = torch.tensor( - self.np_x, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, - requires_grad=True, - ) - dout_torch = torch.tensor( - self.np_dout, - device='cuda', - dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, - requires_grad=True, - ) - return x_torch, dout_torch - + def gen_eager_inputs_and_dout(self): x_eager = paddle.to_tensor( self.np_x, @@ -108,30 +84,18 @@ def gen_static_inputs_and_dout(self): dout_static.stop_gradient = False return x_static, dout_static - def cal_torch_res(self, x, dout): - x_t = x - dout_t = dout - if self.dtype == "bfloat16": - x_t = x.to(dtype=torch.bfloat16) - dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) - out = torch.nn.functional.dropout(x_t, p=self.p) - out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) - if self.dtype == "bfloat16": - out = out.to(dtype=torch.float32) - return out, out_grads - def cal_eager_res(self, x, dout): x_t = x dout_t = dout if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) - out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) - out_grads = paddle.grad( - [out], [x], grad_outputs=[dout_t], 
retain_graph=True - ) + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + out_grads = paddle.grad([out], [x], + grad_outputs=[dout_t], + retain_graph=True) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -142,11 +106,11 @@ def cal_static_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) - out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout(x_t, p=self.p) - out_grads = paddle.static.gradients( - [out], [x], target_gradients=[dout_t] - ) + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -160,15 +124,16 @@ def test_eager_accuracy(self): # calculate incubate eager res x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) del x_eager del dout_eager paddle.device.cuda.empty_cache() out_eager_np = out_eager.numpy() out_grads_eager_np = map_structure( - lambda x: x.numpy(), - out_grads_eager, - ) + lambda x: x.numpy(), + out_grads_eager, + ) del out_eager del out_grads_eager paddle.device.cuda.empty_cache() @@ -176,21 +141,19 @@ def test_eager_accuracy(self): np.testing.assert_equal( out_eager_np, out_eager_develop, - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager_np)): np.testing.assert_equal( out_grads_eager_np[idx], out_eager_grads_develop[idx], - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) % self.dtype, ) - % self.dtype, - ) - + def test_static_accuracy(self): # get develop static res develop_res_array = np.load(self.save_static_res_path) @@ -207,71 +170,73 @@ def test_static_accuracy(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static] + out_grads_static, ) out_static, out_grads_static = out[0], out[1:] - + # compare incubate static res with develop static res np.testing.assert_equal( out_static, out_static_develop, - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) % 
self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_develop[idx], - err_msg=( - 'Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) % self.dtype, ) def test_eager_stability(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() - out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res(x_eager, dout_eager) + set_seed() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() out_grads_eager_baseline_np = map_structure( - lambda x: x.numpy(), - out_grads_eager_baseline, - ) + lambda x: x.numpy(), + out_grads_eager_baseline, + ) del out_eager_baseline del out_grads_eager_baseline paddle.device.cuda.empty_cache() for i in range(50): - out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) out_eager = out_eager.numpy() out_grads_eager = map_structure( - lambda x: x.numpy(), - out_grads_eager, - ) + lambda x: x.numpy(), + out_grads_eager, + ) np.testing.assert_equal( out_eager, out_eager_baseline_np, - err_msg=( - 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager)): np.testing.assert_equal( out_grads_eager[idx], out_grads_eager_baseline_np[idx], - err_msg=( - 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) % self.dtype, ) def test_static_stability(self): @@ -284,71 +249,84 @@ def test_static_stability(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static, out_grads_static = out[0], out[1:] np.testing.assert_equal( out_static, out_static_baseline, - err_msg=( - 'Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_baseline[idx], - err_msg=( - 'Incubate: 
paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) % self.dtype, ) class TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float16" self.save_static_res_path = "./static_develop_res_case1_fp16.npz" self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "bfloat16" self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float32" self.save_static_res_path = "./static_develop_res_case2_fp32.npz" self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float16" self.save_static_res_path = "./static_develop_res_case2_fp16.npz" self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "bfloat16" diff --git a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py index 73990d5..c30048d 100644 --- a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py +++ b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py @@ -8,15 +8,21 @@ from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode seed = 1234 -np.random.seed(seed) -paddle.seed(seed) -torch.manual_seed(seed) -if core.is_compiled_with_cuda(): - paddle.set_flags({'FLAGS_cudnn_deterministic': True}) - torch.backends.cudnn.deterministic = True - torch.cuda.manual_seed_all(seed) + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) def generate_np_inputs_and_dout(): @@ -151,7 +157,7 @@ def cal_torch_res(self, x, y, dout): x_t = x.to(dtype=torch.bfloat16) y_t = y.to(dtype=torch.bfloat16) dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) + set_seed() out = torch.nn.functional.dropout(x_t, p=self.p) + y_t out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -166,7 +172,7 @@ def cal_eager_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) out_grads = paddle.grad([out], [x, y], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -181,7 +187,7 @@ def cal_static_res(self, x, y, dout): x_t = paddle.cast(x, dtype="uint16") y_t = paddle.cast(y, dtype="uint16") dout_t = 
paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) out_grads = paddle.static.gradients([out], [x], target_gradients=[dout_t]) @@ -191,6 +197,7 @@ def cal_static_res(self, x, y, dout): def test_eager_accuracy(self): x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) del x_eager @@ -244,6 +251,7 @@ def test_static_accuracy(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, @@ -285,6 +293,7 @@ def test_static_accuracy(self): def test_eager_stability(self): x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -297,6 +306,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, y_eager, dout_eager) out_eager = out_eager.numpy() @@ -333,7 +343,7 @@ def test_static_stability(self): dout_static, ) exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) - + set_seed() exe.run(sp) out = exe.run( mp, @@ -346,6 +356,7 @@ def test_static_stability(self): ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, feed={ diff --git a/test_dropout/test_nn_Dropout_develop.py b/test_dropout/test_nn_Dropout_develop.py index ddba168..7c169a8 100644 --- a/test_dropout/test_nn_Dropout_develop.py +++ b/test_dropout/test_nn_Dropout_develop.py @@ -3,19 +3,27 @@ import torch import unittest import sys + sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode seed = 1234 -np.random.seed(seed) -paddle.seed(seed) -torch.manual_seed(seed) -if core.is_compiled_with_cuda(): - paddle.set_flags({'FLAGS_cudnn_deterministic': True}) - torch.backends.cudnn.deterministic = True - torch.cuda.manual_seed_all(seed) + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + def generate_np_inputs_and_dout(): p = 0.1 @@ -31,6 +39,7 @@ def generate_np_inputs_and_dout(): class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): self.init_params() self.init_threshold() @@ -73,16 +82,14 @@ def gen_torch_inputs_and_dout(self): self.np_x, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) dout_torch = torch.tensor( self.np_dout, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) return x_torch, dout_torch @@ -123,7 +130,7 @@ def cal_torch_res(self, x, dout): if self.dtype == "bfloat16": x_t = x.to(dtype=torch.bfloat16) dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) + set_seed() out = torch.nn.Dropout(p=self.p)(x_t) out_grads = 
torch.autograd.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -136,11 +143,9 @@ def cal_eager_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.Dropout(p=self.p)(x_t) - out_grads = paddle.grad( - [out], [x], grad_outputs=[dout_t] - ) + out_grads = paddle.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -151,17 +156,17 @@ def cal_static_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.Dropout(p=self.p)(x_t) - out_grads = paddle.static.gradients( - [out], [x], target_gradients=[dout_t] - ) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads def test_eager_accuracy(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) del x_eager del dout_eager @@ -175,7 +180,8 @@ def test_eager_accuracy(self): del out_grads_eager paddle.device.cuda.empty_cache() # save eager res for test_matmul_incubate - np.savez(self.save_eager_res_path, out_eager=out_eager_np, + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, out_grads_eager_0=out_grads_eager_np[0]) # compare eager res with torch @@ -184,10 +190,9 @@ def test_eager_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager_np)): np.testing.assert_allclose( @@ -195,11 +200,11 @@ def test_eager_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.Dropout eager grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.Dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, ) + def test_static_accuracy(self): with paddle.fluid.framework._dygraph_guard(None): mp, sp = paddle.static.Program(), paddle.static.Program() @@ -209,19 +214,22 @@ def test_static_accuracy(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static] + out_grads_static, ) out_static, out_grads_static = out[0], out[1:] # save static res for test_matmul_incubate - np.savez(self.save_static_res_path, out_static=out_static, + np.savez(self.save_static_res_path, + out_static=out_static, out_grads_static_0=out_grads_static[0]) # compare static res with torch @@ -230,10 +238,9 @@ def test_static_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.Dropout static forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.Dropout static forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_allclose( @@ -241,14 
+248,14 @@ def test_static_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.Dropout static grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.Dropout static grad res with torch failed in %s dtype' + ) % self.dtype, ) - + def test_eager_stability(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -261,6 +268,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, dout_eager) out_eager = out_eager.numpy() @@ -271,24 +279,21 @@ def test_eager_stability(self): np.testing.assert_equal( out_eager, out_eager_baseline_np, - err_msg=( - 'Develop: paddle.nn.Dropout eager forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout eager forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager)): np.testing.assert_equal( out_grads_eager[idx], out_grads_eager_baseline_np[idx], - err_msg=( - 'Develop: paddle.nn.Dropout eager grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout eager grad is unstable in %s dtype' + ) % self.dtype, ) def test_static_stability(self): with paddle.fluid.framework._dygraph_guard(None): - paddle.framework.random._manual_program_seed(seed) mp, sp = paddle.static.Program(), paddle.static.Program() with paddle.static.program_guard(mp, sp): x_static, dout_static = self.gen_static_inputs_and_dout() @@ -296,43 +301,48 @@ def test_static_stability(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) - + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static, out_grads_static = out[0], out[1:] np.testing.assert_equal( out_static, out_static_baseline, - err_msg=( - 'Develop: paddle.nn.Dropout static forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout static forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_baseline[idx], - err_msg=( - 'Develop: paddle.nn.Dropout static grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.Dropout static grad is unstable in %s dtype' + ) % self.dtype, ) + class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float16" @@ -341,6 +351,7 @@ def init_params(self): class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "bfloat16" @@ -349,6 +360,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = 
"float32" @@ -357,6 +369,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float16" @@ -365,6 +378,7 @@ def init_params(self): class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "bfloat16" diff --git a/test_dropout/test_nn_functional_dropout_develop.py b/test_dropout/test_nn_functional_dropout_develop.py index 52243ab..58f53a1 100644 --- a/test_dropout/test_nn_functional_dropout_develop.py +++ b/test_dropout/test_nn_functional_dropout_develop.py @@ -3,22 +3,28 @@ import torch import unittest import sys + sys.path.append("..") from utils import TOLERANCE, convert_dtype_to_torch_type from paddle.fluid import core from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode seed = 1234 + def set_seed(): np.random.seed(seed) paddle.seed(seed) torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) if core.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': True}) torch.backends.cudnn.deterministic = True torch.cuda.manual_seed_all(seed) + def generate_np_inputs_and_dout(): p = 0.1 @@ -33,8 +39,8 @@ def generate_np_inputs_and_dout(): class TestMatmulDevelopCase1_FP32(unittest.TestCase): + def setUp(self): - set_seed() self.init_params() self.init_threshold() self.init_np_inputs_and_dout() @@ -76,16 +82,14 @@ def gen_torch_inputs_and_dout(self): self.np_x, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) dout_torch = torch.tensor( self.np_dout, device='cuda', dtype=convert_dtype_to_torch_type(self.dtype) - if self.dtype != 'bfloat16' - else torch.float32, + if self.dtype != 'bfloat16' else torch.float32, requires_grad=True, ) return x_torch, dout_torch @@ -126,7 +130,7 @@ def cal_torch_res(self, x, dout): if self.dtype == "bfloat16": x_t = x.to(dtype=torch.bfloat16) dout_t = dout.to(dtype=torch.bfloat16) - torch.manual_seed(seed) + set_seed() out = torch.nn.functional.dropout(x_t, p=self.p) out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": @@ -139,11 +143,9 @@ def cal_eager_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.functional.dropout(x_t, p=self.p) - out_grads = paddle.grad( - [out], [x], grad_outputs=[dout_t] - ) + out_grads = paddle.grad([out], [x], grad_outputs=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads @@ -154,17 +156,17 @@ def cal_static_res(self, x, dout): if self.dtype == "bfloat16": x_t = paddle.cast(x, dtype="uint16") dout_t = paddle.cast(dout, dtype="uint16") - paddle.seed(seed) + set_seed() out = paddle.nn.functional.dropout(x_t, p=self.p) - out_grads = paddle.static.gradients( - [out], [x], target_gradients=[dout_t] - ) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) if self.dtype == "bfloat16": out = paddle.cast(out, dtype="float32") return out, out_grads def test_eager_accuracy(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) del x_eager del dout_eager @@ -178,7 +180,8 @@ def 
test_eager_accuracy(self): del out_grads_eager paddle.device.cuda.empty_cache() # save eager res for test_matmul_incubate - np.savez(self.save_eager_res_path, out_eager=out_eager_np, + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, out_grads_eager_0=out_grads_eager_np[0]) # compare eager res with torch @@ -187,10 +190,9 @@ def test_eager_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager_np)): np.testing.assert_allclose( @@ -198,11 +200,11 @@ def test_eager_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, ) + def test_static_accuracy(self): with paddle.fluid.framework._dygraph_guard(None): mp, sp = paddle.static.Program(), paddle.static.Program() @@ -212,19 +214,22 @@ def test_static_accuracy(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static] + out_grads_static, ) out_static, out_grads_static = out[0], out[1:] # save static res for test_matmul_incubate - np.savez(self.save_static_res_path, out_static=out_static, + np.savez(self.save_static_res_path, + out_static=out_static, out_grads_static_0=out_grads_static[0]) # compare static res with torch @@ -233,10 +238,9 @@ def test_static_accuracy(self): self.out_torch, self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_allclose( @@ -244,14 +248,14 @@ def test_static_accuracy(self): self.out_grads_torch[idx], self.atol, self.rtol, - err_msg=( - 'Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' + ) % self.dtype, ) - + def test_eager_stability(self): x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( x_eager, dout_eager) out_eager_baseline_np = out_eager_baseline.numpy() @@ -264,6 +268,7 @@ def test_eager_stability(self): paddle.device.cuda.empty_cache() for i in range(50): + set_seed() out_eager, out_grads_eager = self.cal_eager_res( x_eager, dout_eager) out_eager = out_eager.numpy() @@ -274,19 +279,17 @@ def test_eager_stability(self): np.testing.assert_equal( out_eager, out_eager_baseline_np, - err_msg=( - 'Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_eager)): 
np.testing.assert_equal( out_grads_eager[idx], out_grads_eager_baseline_np[idx], - err_msg=( - 'Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' + ) % self.dtype, ) def test_static_stability(self): @@ -299,43 +302,48 @@ def test_static_stability(self): x_static, dout_static, ) - exe = paddle.static.Executor( - place=paddle.CUDAPlace(0) - ) - + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() exe.run(sp) out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static_baseline, out_grads_static_baseline = out[0], out[1:] for i in range(50): + set_seed() out = exe.run( mp, - feed={"x": self.np_x, "dout": self.np_dout}, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, fetch_list=[out_static_pg] + out_grads_static_pg, ) out_static, out_grads_static = out[0], out[1:] np.testing.assert_equal( out_static, out_static_baseline, - err_msg=( - 'Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' + ) % self.dtype, ) for idx in range(len(out_grads_static)): np.testing.assert_equal( out_grads_static[idx], out_grads_static_baseline[idx], - err_msg=( - 'Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' - ) - % self.dtype, + err_msg= + ('Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' + ) % self.dtype, ) + class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "float16" @@ -344,6 +352,7 @@ def init_params(self): class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case1.npz" self.dtype = "bfloat16" @@ -352,6 +361,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float32" @@ -360,6 +370,7 @@ def init_params(self): class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "float16" @@ -368,6 +379,7 @@ def init_params(self): class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + def init_params(self): self.np_input_dir = "./inputs_case2.npz" self.dtype = "bfloat16"
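Note: each test file above does sys.path.append("..") and imports TOLERANCE and convert_dtype_to_torch_type from a sibling utils module that is not included in this patch series. A minimal sketch of such a helper is given below, assuming one atol/rtol pair per dtype string; the tolerance values are illustrative placeholders, not the suite's real thresholds.

import torch

# Hypothetical utils.py: per-dtype comparison thresholds consumed as
# TOLERANCE[dtype]["atol"] / TOLERANCE[dtype]["rtol"] by np.testing.assert_allclose.
TOLERANCE = {
    "float32": {"atol": 1e-6, "rtol": 1e-6},
    "float16": {"atol": 1e-3, "rtol": 1e-3},
    "bfloat16": {"atol": 1e-2, "rtol": 1e-2},
}

def convert_dtype_to_torch_type(dtype):
    # Map the dtype strings used by the tests onto the corresponding torch dtypes.
    return {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }[dtype]

Run order also matters: the *_develop tests generate inputs_case1.npz / inputs_case2.npz and save the eager and static baselines (eager_develop_res_*.npz, static_develop_res_*.npz), while the *_incubate tests only load those files, so the develop suite must be executed before the incubate suite.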