diff --git a/test_dropout/test_FusedDropoutAdd_dropout_incubate.py b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py new file mode 100644 index 0000000..f96b53f --- /dev/null +++ b/test_dropout/test_FusedDropoutAdd_dropout_incubate.py @@ -0,0 +1,370 @@ +import numpy as np +import paddle +import torch +import unittest +from paddle.fluid.layers.utils import map_structure +import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.fluid.framework import in_dygraph_mode + +seed = 1234 + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +class TestMatmulIncubateCase1_FP32(unittest.TestCase): + + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_y = np_inputs_array["y"] + self.p = float(np_inputs_array["p"]) + self.np_dout = np_inputs_array["dout"] + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_y = self.np_y.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + y_eager = paddle.to_tensor( + self.np_y, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + y_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, y_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + y_static = paddle.static.data( + 'y', + shape=self.np_y.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + y_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, y_static, dout_static + + def cal_eager_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + y_t + out_grads = paddle.grad([out], [x, y], + grad_outputs=[dout_t], + retain_graph=True) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return 
out, out_grads + + def cal_static_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + y_t + out_grads = paddle.static.gradients([out], [x, y], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + # get develop eager res + develop_res_array = np.load(self.save_eager_res_path) + out_eager_develop = develop_res_array["out_eager"] + out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grad_1_develop = develop_res_array["out_grads_eager_1"] + out_eager_grads_develop = [ + out_eager_grad_0_develop, out_eager_grad_1_develop + ] + + # calculate incubate eager res + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + del x_eager + del y_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # compare incubate eager res with develop eager res + np.testing.assert_equal( + out_eager_np, + out_eager_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_equal( + out_grads_eager_np[idx], + out_eager_grads_develop[idx], + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + # get develop static res + develop_res_array = np.load(self.save_static_res_path) + out_static_develop = develop_res_array["out_static"] + out_grads_static_0_develop = develop_res_array["out_grads_static_0"] + out_grads_static_1_develop = develop_res_array["out_grads_static_1"] + out_grads_static_develop = [ + out_grads_static_0_develop, out_grads_static_1_develop + ] + + # calculate incubate static res + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static, out_grads_static) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # compare incubate static res with develop static res + np.testing.assert_equal( + out_static, + out_static_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_develop[idx], + err_msg= + ('Incubate: compare
paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + set_seed() + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir 
= "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + unittest.main() diff --git a/test_dropout/test_dropout_incubate.py b/test_dropout/test_dropout_incubate.py new file mode 100644 index 0000000..afd6b2d --- /dev/null +++ b/test_dropout/test_dropout_incubate.py @@ -0,0 +1,338 @@ +import numpy as np +import paddle +import torch +import unittest +from paddle.fluid.layers.utils import map_structure +import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.fluid.framework import in_dygraph_mode + +seed = 1234 + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +class TestMatmulIncubateCase1_FP32(unittest.TestCase): + + def setUp(self): + set_seed() + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.p = float(np_inputs_array["p"]) + self.np_dout = np_inputs_array["dout"] + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = 
paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + out_grads = paddle.grad([out], [x], + grad_outputs=[dout_t], + retain_graph=True) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout( + x_t, p=self.p) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + # get develop eager res + develop_res_array = np.load(self.save_eager_res_path) + out_eager_develop = develop_res_array["out_eager"] + out_eager_grad_0_develop = develop_res_array["out_grads_eager_0"] + out_eager_grads_develop = [out_eager_grad_0_develop] + + # calculate incubate eager res + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # compare incubate eager res with develop eager res + np.testing.assert_equal( + out_eager_np, + out_eager_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager forward res with develop eager forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_equal( + out_grads_eager_np[idx], + out_eager_grads_develop[idx], + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate eager grad res with develop eager grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + # get develop static res + develop_res_array = np.load(self.save_static_res_path) + out_static_develop = develop_res_array["out_static"] + out_grads_static_0_develop = develop_res_array["out_grads_static_0"] + out_grads_static_develop = [out_grads_static_0_develop] + + # calculate incubate static res + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # compare incubate static res with develop static res + np.testing.assert_equal( + out_static, + out_static_develop, + err_msg= + ('Incubate: compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static forward res with develop static forward res failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_develop[idx], + err_msg= + ('Incubate: 
compare paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout incubate static grad res with develop static grad res failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + set_seed() + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Incubate: paddle.distributed.fleet.meta_parallel.parallel_layers.random.dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulIncubateCase1_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulIncubateCase1_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulIncubateCase2_FP32(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = 
"./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulIncubateCase2_FP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulIncubateCase2_BFP16(TestMatmulIncubateCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + unittest.main() diff --git a/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py new file mode 100644 index 0000000..c30048d --- /dev/null +++ b/test_dropout/test_incubate_nn_FusedDropoutAdd_develop.py @@ -0,0 +1,434 @@ +import numpy as np +import paddle +import torch +import unittest +import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode + +seed = 1234 + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + y_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + y_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, y=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, y=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + + def setUp(self): + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, y_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res( + x_torch, y_torch, dout_torch) + del x_torch + del y_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.detach().cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_y = np_inputs_array["y"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + 
self.np_x = self.np_x.astype("float16") + self.np_y = self.np_y.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + y_torch = torch.tensor( + self.np_y, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + return x_torch, y_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + y_eager = paddle.to_tensor( + self.np_y, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + y_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, y_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + y_static = paddle.static.data( + 'y', + shape=self.np_y.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + y_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, y_static, dout_static + + def cal_torch_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + y_t = y.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + set_seed() + out = torch.nn.functional.dropout(x_t, p=self.p) + y_t + out_grads = torch.autograd.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) + out_grads = paddle.grad([out], [x, y], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, y, dout): + x_t = x + y_t = y + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + y_t = paddle.cast(y, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.incubate.nn.FusedDropoutAdd(p=self.p)(x_t, y_t) + out_grads = paddle.static.gradients([out], [x, y], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + del x_eager + del y_eager + del dout_eager +
paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_FusedDropoutAdd_dropout_incubate + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0], + out_grads_eager_1=out_grads_eager_np[1]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.incubate.nn.FusedDropoutAdd eager forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.incubate.nn.FusedDropoutAdd eager grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static, out_grads_static) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_FusedDropoutAdd_dropout_incubate + np.savez(self.save_static_res_path, + out_static=out_static, + out_grads_static_0=out_grads_static[0], + out_grads_static_1=out_grads_static[1]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.incubate.nn.FusedDropoutAdd static forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.incubate.nn.FusedDropoutAdd static grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, y_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, y_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Develop: paddle.incubate.nn.FusedDropoutAdd eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Develop: paddle.incubate.nn.FusedDropoutAdd eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): +
paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, y_static, dout_static = self.gen_static_inputs_and_dout( + ) + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + y_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + set_seed() + out = exe.run( + mp, + feed={ + "x": self.np_x, + "y": self.np_y, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Develop: paddle.incubate.nn.FusedDropoutAdd static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Develop: paddle.incubate.nn.FusedDropoutAdd static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + generate_np_inputs_and_dout() + unittest.main() diff --git a/test_dropout/test_nn_Dropout_develop.py b/test_dropout/test_nn_Dropout_develop.py new file mode 100644 index 0000000..7c169a8 --- /dev/null +++ b/test_dropout/test_nn_Dropout_develop.py @@ -0,0 +1,391 @@ +import numpy as np +import paddle +import torch +import unittest +import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode + +seed = 1234 + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): +
paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + + def setUp(self): + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', + shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + set_seed() + out = torch.nn.Dropout(p=self.p)(x_t) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + 
return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.nn.Dropout(p=self.p)(x_t) + out_grads = paddle.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.nn.Dropout(p=self.p)(x_t) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_dropout_incubate + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.Dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.Dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_dropout_incubate + np.savez(self.save_static_res_path, + out_static=out_static, + out_grads_static_0=out_grads_static[0]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.Dropout static forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.Dropout static grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), +
out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Develop: paddle.nn.Dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Develop: paddle.nn.Dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + set_seed() + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Develop: paddle.nn.Dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Develop: paddle.nn.Dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + generate_np_inputs_and_dout() + 
unittest.main() diff --git a/test_dropout/test_nn_functional_dropout_develop.py b/test_dropout/test_nn_functional_dropout_develop.py new file mode 100644 index 0000000..58f53a1 --- /dev/null +++ b/test_dropout/test_nn_functional_dropout_develop.py @@ -0,0 +1,392 @@ +import numpy as np +import paddle +import torch +import unittest +import sys + +sys.path.append("..") +from utils import TOLERANCE, convert_dtype_to_torch_type +from paddle.fluid import core +from paddle.utils import map_structure +from paddle.fluid.framework import in_dygraph_mode + +seed = 1234 + + +def set_seed(): + np.random.seed(seed) + paddle.seed(seed) + torch.manual_seed(seed) + if not in_dygraph_mode(): + paddle.framework.random._manual_program_seed(seed) + if core.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': True}) + torch.backends.cudnn.deterministic = True + torch.cuda.manual_seed_all(seed) + + +def generate_np_inputs_and_dout(): + p = 0.1 + + x_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + dout_case1 = np.random.random(size=[1, 4096, 12288]).astype("float32") + + x_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + dout_case2 = np.random.random(size=[1, 32, 4096, 4096]).astype("float32") + + np.savez("./inputs_case1.npz", x=x_case1, p=p, dout=dout_case1) + np.savez("./inputs_case2.npz", x=x_case2, p=p, dout=dout_case2) + + +class TestMatmulDevelopCase1_FP32(unittest.TestCase): + + def setUp(self): + self.init_params() + self.init_threshold() + self.init_np_inputs_and_dout() + x_torch, dout_torch = self.gen_torch_inputs_and_dout() + out_torch, out_grads_torch = self.cal_torch_res(x_torch, dout_torch) + del x_torch + del dout_torch + self.out_torch = out_torch.cpu().detach().numpy() + self.out_grads_torch = map_structure( + lambda x: x.cpu().numpy(), + out_grads_torch, + ) + del out_torch, out_grads_torch + torch.cuda.empty_cache() + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case1_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp32.npz" + + def init_threshold(self): + self.atol = TOLERANCE[self.dtype]["atol"] + self.rtol = TOLERANCE[self.dtype]["rtol"] + + def init_np_inputs_and_dout(self): + np_inputs_array = np.load(self.np_input_dir) + # get np array from npz file + self.np_x = np_inputs_array["x"] + self.np_dout = np_inputs_array["dout"] + self.p = float(np_inputs_array["p"]) + # convert np array dtype + if self.dtype == "float16": + self.np_x = self.np_x.astype("float16") + self.np_dout = self.np_dout.astype("float16") + + def gen_torch_inputs_and_dout(self): + x_torch = torch.tensor( + self.np_x, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + dout_torch = torch.tensor( + self.np_dout, + device='cuda', + dtype=convert_dtype_to_torch_type(self.dtype) + if self.dtype != 'bfloat16' else torch.float32, + requires_grad=True, + ) + return x_torch, dout_torch + + def gen_eager_inputs_and_dout(self): + x_eager = paddle.to_tensor( + self.np_x, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + x_eager.stop_gradient = False + dout_eager = paddle.to_tensor( + self.np_dout, + dtype=self.dtype if self.dtype != 'bfloat16' else "float32", + place="gpu", + ) + dout_eager.stop_gradient = False + return x_eager, dout_eager + + def gen_static_inputs_and_dout(self): + x_static = paddle.static.data( + 'x', +
shape=self.np_x.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + x_static.stop_gradient = False + dout_static = paddle.static.data( + 'dout', + shape=self.np_dout.shape, + dtype=self.dtype if self.dtype != "bfloat16" else "float32", + ) + dout_static.stop_gradient = False + return x_static, dout_static + + def cal_torch_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = x.to(dtype=torch.bfloat16) + dout_t = dout.to(dtype=torch.bfloat16) + set_seed() + out = torch.nn.functional.dropout(x_t, p=self.p) + out_grads = torch.autograd.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = out.to(dtype=torch.float32) + return out, out_grads + + def cal_eager_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.nn.functional.dropout(x_t, p=self.p) + out_grads = paddle.grad([out], [x], grad_outputs=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def cal_static_res(self, x, dout): + x_t = x + dout_t = dout + if self.dtype == "bfloat16": + x_t = paddle.cast(x, dtype="uint16") + dout_t = paddle.cast(dout, dtype="uint16") + set_seed() + out = paddle.nn.functional.dropout(x_t, p=self.p) + out_grads = paddle.static.gradients([out], [x], + target_gradients=[dout_t]) + if self.dtype == "bfloat16": + out = paddle.cast(out, dtype="float32") + return out, out_grads + + def test_eager_accuracy(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager, out_grads_eager = self.cal_eager_res(x_eager, dout_eager) + del x_eager + del dout_eager + paddle.device.cuda.empty_cache() + out_eager_np = out_eager.numpy() + out_grads_eager_np = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + del out_eager + del out_grads_eager + paddle.device.cuda.empty_cache() + # save eager res for test_matmul_incubate + np.savez(self.save_eager_res_path, + out_eager=out_eager_np, + out_grads_eager_0=out_grads_eager_np[0]) + + # compare eager res with torch + np.testing.assert_allclose( + out_eager_np, + self.out_torch, + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager_np)): + np.testing.assert_allclose( + out_grads_eager_np[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout eager grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_static_accuracy(self): + with paddle.fluid.framework._dygraph_guard(None): + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static, out_grads_static) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static] + out_grads_static, + ) + out_static, out_grads_static = out[0], out[1:] + + # save static res for test_matmul_incubate + np.savez(self.save_static_res_path, + out_static=out_static, + out_grads_static_0=out_grads_static[0]) + + # compare static res with torch + np.testing.assert_allclose( + out_static, + self.out_torch, + self.atol, + self.rtol, + 
err_msg= + ('Develop: compare paddle.nn.functional.dropout static forward res with torch failed in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_allclose( + out_grads_static[idx], + self.out_grads_torch[idx], + self.atol, + self.rtol, + err_msg= + ('Develop: compare paddle.nn.functional.dropout static grad res with torch failed in %s dtype' + ) % self.dtype, + ) + + def test_eager_stability(self): + x_eager, dout_eager = self.gen_eager_inputs_and_dout() + set_seed() + out_eager_baseline, out_grads_eager_baseline = self.cal_eager_res( + x_eager, dout_eager) + out_eager_baseline_np = out_eager_baseline.numpy() + out_grads_eager_baseline_np = map_structure( + lambda x: x.numpy(), + out_grads_eager_baseline, + ) + del out_eager_baseline + del out_grads_eager_baseline + paddle.device.cuda.empty_cache() + + for i in range(50): + set_seed() + out_eager, out_grads_eager = self.cal_eager_res( + x_eager, dout_eager) + out_eager = out_eager.numpy() + out_grads_eager = map_structure( + lambda x: x.numpy(), + out_grads_eager, + ) + np.testing.assert_equal( + out_eager, + out_eager_baseline_np, + err_msg= + ('Develop: paddle.nn.functional.dropout eager forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_eager)): + np.testing.assert_equal( + out_grads_eager[idx], + out_grads_eager_baseline_np[idx], + err_msg= + ('Develop: paddle.nn.functional.dropout eager grad is unstable in %s dtype' + ) % self.dtype, + ) + + def test_static_stability(self): + with paddle.fluid.framework._dygraph_guard(None): + paddle.framework.random._manual_program_seed(seed) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + x_static, dout_static = self.gen_static_inputs_and_dout() + (out_static_pg, out_grads_static_pg) = self.cal_static_res( + x_static, + dout_static, + ) + exe = paddle.static.Executor(place=paddle.CUDAPlace(0)) + set_seed() + exe.run(sp) + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static_baseline, out_grads_static_baseline = out[0], out[1:] + for i in range(50): + set_seed() + out = exe.run( + mp, + feed={ + "x": self.np_x, + "dout": self.np_dout + }, + fetch_list=[out_static_pg] + out_grads_static_pg, + ) + out_static, out_grads_static = out[0], out[1:] + np.testing.assert_equal( + out_static, + out_static_baseline, + err_msg= + ('Develop: paddle.nn.functional.dropout static forward is unstable in %s dtype' + ) % self.dtype, + ) + for idx in range(len(out_grads_static)): + np.testing.assert_equal( + out_grads_static[idx], + out_grads_static_baseline[idx], + err_msg= + ('Develop: paddle.nn.functional.dropout static grad is unstable in %s dtype' + ) % self.dtype, + ) + + +class TestMatmulDevelopCase1_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case1_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_fp16.npz" + + +class TestMatmulDevelopCase1_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case1.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case1_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case1_bfp16.npz" + + +class TestMatmulDevelopCase2_FP32(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = 
"./inputs_case2.npz" + self.dtype = "float32" + self.save_static_res_path = "./static_develop_res_case2_fp32.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp32.npz" + + +class TestMatmulDevelopCase2_FP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "float16" + self.save_static_res_path = "./static_develop_res_case2_fp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_fp16.npz" + + +class TestMatmulDevelopCase2_BFP16(TestMatmulDevelopCase1_FP32): + + def init_params(self): + self.np_input_dir = "./inputs_case2.npz" + self.dtype = "bfloat16" + self.save_static_res_path = "./static_develop_res_case2_bfp16.npz" + self.save_eager_res_path = "./eager_develop_res_case2_bfp16.npz" + + +if __name__ == '__main__': + generate_np_inputs_and_dout() + unittest.main()