diff --git a/test_fp32_with_bf16/run.sh b/test_fp32_with_bf16/run.sh
new file mode 100644
index 0000000..9ecae02
--- /dev/null
+++ b/test_fp32_with_bf16/run.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+for dir in test_flash_attention/ test_fused_linear/ test_layernorm/ test_matmul/ test_silu/ test_vocab_parallel_embedding/
+do
+    cd $dir
+    ./run.sh &
+    cd ..
+done
diff --git a/test_fp32_with_bf16/test_flash_attention/case.txt b/test_fp32_with_bf16/test_flash_attention/case.txt
new file mode 100644
index 0000000..1ef302a
--- /dev/null
+++ b/test_fp32_with_bf16/test_flash_attention/case.txt
@@ -0,0 +1,8 @@
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-0
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-1
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-2
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-3
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-4
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-5
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-6
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-7
diff --git a/test_fp32_with_bf16/test_flash_attention/run.sh b/test_fp32_with_bf16/test_flash_attention/run.sh
new file mode 100755
index 0000000..0c7d742
--- /dev/null
+++ b/test_fp32_with_bf16/test_flash_attention/run.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=1
+export NVIDIA_TF32_OVERRIDE=0
+export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
+rm -f new_log_test_flash_attention_fp32vs_bfp16
+# Read each case path from case.txt, one per line
+while IFS= read -r line
+do
+    # Run the test with the path as an argument and append output to the log
+    python test_flash_attention_fp32vs_bfp16.py "$line" 2>&1 | tee -a new_log_test_flash_attention_fp32vs_bfp16
+
+done < case.txt
diff --git a/test_fp32_with_bf16/test_flash_attention/test_flash_attention_fp32vs_bfp16.py b/test_fp32_with_bf16/test_flash_attention/test_flash_attention_fp32vs_bfp16.py
index 1a224a8..71a937b 100644
--- a/test_fp32_with_bf16/test_flash_attention/test_flash_attention_fp32vs_bfp16.py
+++ b/test_fp32_with_bf16/test_flash_attention/test_flash_attention_fp32vs_bfp16.py
@@ -28,6 +28,8 @@
     convert_dtype_to_torch_type,
     np_assert_accuracy
 )
+
+niuliling_path = None  # global data path; set from sys.argv in __main__
 def get_triangle_upper_mask(shape):
     mask = paddle.full(shape=shape, fill_value=-np.inf)
     mask.stop_gradient = True
@@ -56,26 +58,47 @@ def setUp(self):
     def init_np_inputs_and_dout(self):
         # init np array
-        self.np_x = np.random.random(size=[1,8192,14,128]).astype("float32") - 0.5
-        self.np_dout = np.random.random(size=[1,8192,14,128]).astype("float32") - 0.5
+        data_xwb = np.load(niuliling_path + ".npz")
+        data_dout = np.load(niuliling_path + ".npy")
+
+        self.np_q = data_xwb["query"].astype("float32")
+        self.np_k = data_xwb["key"].astype("float32")
+        self.np_v = data_xwb["value"].astype("float32")
+
+        print("shape q", self.np_q.shape)
+        print("shape k", self.np_k.shape)
+        print("shape v", self.np_v.shape)
+        self.np_dout = data_dout.astype("float32")

     def gen_eager_inputs_and_dout(self):
         x_eager = paddle.to_tensor(
-            self.np_x,
+            self.np_q,
             dtype="float32",
             place="gpu",
         )
         x_eager.stop_gradient = False
+        k_eager = paddle.to_tensor(
+            self.np_k,
+            dtype="float32",
+            place="gpu",
+        )
+        k_eager.stop_gradient = False
+        v_eager = paddle.to_tensor(
+            self.np_v,
+            dtype="float32",
+            place="gpu",
+        )
+        v_eager.stop_gradient = False
         dout_eager = paddle.to_tensor(
             self.np_dout,
             dtype="float32",
             place="gpu",
         )
         dout_eager.stop_gradient = False
-        return x_eager, dout_eager
-    def cal_bfp16_res(self, x, dout):
-        out, _ = flash_attention(x, x, x, 0.0, True, False)
-        out_grads = paddle.grad([out], [x], grad_outputs=[dout])
+        return x_eager, k_eager, v_eager, dout_eager
+    def cal_bfp16_res(self, x, k, v, dout):
+        out, _ = flash_attention(x, k, v, 0.0, True, False)
+        out_grads = paddle.grad([out], [x, k, v], grad_outputs=[dout])
         return out, out_grads

     # def cal_fp32_res(self, x, dout):
@@ -87,16 +110,16 @@ def cal_bfp16_res(self, x, dout):
     #     out_grads = map_structure(lambda x: paddle.cast(x, dtype="float32"), out_grads)
     #     return out, out_grads

-    def cal_fp32_res(self, x, dout):
-        out = multi_head_attention(x, x, x)
-        out_grads = paddle.grad([out], [x], grad_outputs=[dout])
+    def cal_fp32_res(self, x, k, v, dout):
+        out = multi_head_attention(x, k, v)
+        out_grads = paddle.grad([out], [x, k, v], grad_outputs=[dout])
         return out, out_grads

     def test_flash_atten_fp32vsbfp16_mode1(self):
-        x_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
-        x_fp32, dout_fp32 = paddle.cast(x_bfp16,"float32"), paddle.cast(dout_bfp16,"float32")
-        out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, dout_fp32)
-        out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, dout_bfp16)
+        x_bfp16, k_bfp16, v_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
+        x_fp32, k_fp32, v_fp32, dout_fp32 = paddle.cast(x_bfp16, "float32"), paddle.cast(k_bfp16, "float32"), paddle.cast(v_bfp16, "float32"), paddle.cast(dout_bfp16, "float32")
+        out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, k_fp32, v_fp32, dout_fp32)
+        out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, k_bfp16, v_bfp16, dout_bfp16)
         pt_out_bfp16 = paddle.cast(out_bfp16, "float32")
         pt_out_grads_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="float32"), out_grads_bfp16)
         try:
@@ -132,11 +155,11 @@ def test_flash_atten_fp32vsbfp16_mode1(self):
             print(e)

     def test_flash_atten_fp32vsbfp16_mode2(self):
-        x_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
-        x_fp32, dout_fp32 = paddle.cast(x_bfp16,"float32"), paddle.cast(dout_bfp16,"float32")
-        out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, dout_fp32)
+        x_bfp16, k_bfp16, v_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
+        x_fp32, k_fp32, v_fp32, dout_fp32 = paddle.cast(x_bfp16, "float32"), paddle.cast(k_bfp16, "float32"), paddle.cast(v_bfp16, "float32"), paddle.cast(dout_bfp16, "float32")
+        out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, k_fp32, v_fp32, dout_fp32)
         out_grads_fp32 = map_structure(lambda x: paddle.cast(paddle.cast(x, "bfloat16"), "float32"), out_grads_fp32)
-        out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, dout_bfp16)
+        out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, k_bfp16, v_bfp16, dout_bfp16)
         pt_out_bfp16 = paddle.cast(out_bfp16, "float32")
         pt_out_grads_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="float32"), out_grads_bfp16)
         try:
@@ -172,5 +195,14 @@ def test_flash_atten_fp32vsbfp16_mode2(self):
             print(e)

 if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print("Please provide the data_path argument")
+        sys.exit(1)
+
+    tmp = sys.argv[1]  # set the global data_path
+    niuliling_path = tmp
+    print(tmp)
+
+    del sys.argv[1]  # drop the extra argument so unittest.main() does not parse it
     np.random.seed(2023)
     unittest.main()
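The fp32 reference in the test above comes from the repository's `multi_head_attention` helper, which is not shown in this diff. Below is a minimal, self-contained sketch of the same comparison pattern: a naive causal attention written inline stands in as the fp32 reference, and the `flash_attention(q, k, v, 0.0, True, False)` call mirrors the test (dropout 0.0, causal=True, return_softmax=False). The naive reference, the small shapes, and the final print are assumptions for illustration, not the repository's code; it needs a GPU build of Paddle, since the flash attention kernels accept only half-precision inputs.

```python
import numpy as np
import paddle
from paddle.nn.functional.flash_attention import flash_attention

def naive_causal_attention(q, k, v):
    # q, k, v: [batch, seqlen, num_heads, head_dim], float32
    qt = paddle.transpose(q, [0, 2, 1, 3])  # -> [b, h, s, d]
    kt = paddle.transpose(k, [0, 2, 1, 3])
    vt = paddle.transpose(v, [0, 2, 1, 3])
    scale = 1.0 / np.sqrt(q.shape[-1])
    scores = paddle.matmul(qt, kt, transpose_y=True) * scale  # [b, h, s, s]
    s = scores.shape[-1]
    # upper-triangular -inf mask blocks attention to future positions
    mask = paddle.triu(paddle.full([s, s], -np.inf), diagonal=1)
    probs = paddle.nn.functional.softmax(scores + mask, axis=-1)
    out = paddle.matmul(probs, vt)
    return paddle.transpose(out, [0, 2, 1, 3])  # back to [b, s, h, d]

q = paddle.rand([1, 128, 2, 64]) - 0.5  # small hypothetical shapes
k = paddle.rand([1, 128, 2, 64]) - 0.5
v = paddle.rand([1, 128, 2, 64]) - 0.5

ref = naive_causal_attention(q, k, v)  # fp32 reference
q16, k16, v16 = (paddle.cast(t, "bfloat16") for t in (q, k, v))
out16, _ = flash_attention(q16, k16, v16, 0.0, True, False)
diff = paddle.abs(paddle.cast(out16, "float32") - ref)
print("max abs diff, fp32 reference vs bf16 flash attention:", float(diff.max()))
```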
diff --git a/test_fp32_with_bf16/test_fused_linear/case.txt b/test_fp32_with_bf16/test_fused_linear/case.txt
new file mode 100644
index 0000000..c45cd38
--- /dev/null
+++ b/test_fp32_with_bf16/test_fused_linear/case.txt
@@ -0,0 +1,8 @@
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_0.w_0-server_nlg_mask_lm_out_fc_0.b_0-eager_in_tmp_11-pp-11-mp-0
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_1.w_0-server_nlg_mask_lm_out_fc_1.b_0-eager_in_tmp_11-pp-11-mp-1
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_2.w_0-server_nlg_mask_lm_out_fc_2.b_0-eager_in_tmp_11-pp-11-mp-2
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_3.w_0-server_nlg_mask_lm_out_fc_3.b_0-eager_in_tmp_11-pp-11-mp-3
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_4.w_0-server_nlg_mask_lm_out_fc_4.b_0-eager_in_tmp_11-pp-11-mp-4
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_5.w_0-server_nlg_mask_lm_out_fc_5.b_0-eager_in_tmp_11-pp-11-mp-5
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_6.w_0-server_nlg_mask_lm_out_fc_6.b_0-eager_in_tmp_11-pp-11-mp-6
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_7.w_0-server_nlg_mask_lm_out_fc_7.b_0-eager_in_tmp_11-pp-11-mp-7
diff --git a/test_fp32_with_bf16/test_fused_linear/run.sh b/test_fp32_with_bf16/test_fused_linear/run.sh
new file mode 100755
index 0000000..b61e622
--- /dev/null
+++ b/test_fp32_with_bf16/test_fused_linear/run.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=2
+export NVIDIA_TF32_OVERRIDE=0
+export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
+rm -f new_log_test_fused_linear_develop
+# Read each case path from case.txt, one per line
+while IFS= read -r line
+do
+    # Run the test with the path as an argument and append output to the log
+    python test_fused_linear_develop.py "$line" 2>&1 | tee -a new_log_test_fused_linear_develop
+
+done < case.txt
diff --git a/test_fp32_with_bf16/test_fused_linear/test_fused_linear_develop.py b/test_fp32_with_bf16/test_fused_linear/test_fused_linear_develop.py
index f1fae7a..347b34d 100644
--- a/test_fp32_with_bf16/test_fused_linear/test_fused_linear_develop.py
+++ b/test_fp32_with_bf16/test_fused_linear/test_fused_linear_develop.py
@@ -13,6 +13,7 @@
 global_out = []
 global_dout = []
+niuliling_path = None  # global data path; set from sys.argv in __main__

 class TestFCDevelop(unittest.TestCase):
     def __init__(self, shape, dtype, test_mode):
@@ -28,10 +29,15 @@ def init_threshold(self):
     def init_np_inputs_and_dout(self):
         np.random.seed(123)
-        self.np_x = np.random.random(size=self.shape["x"]).astype("float32") - 0.5
-        self.np_w = np.random.random(size=self.shape["w"]).astype("float32") - 0.5
-        self.np_b = np.random.random(size=self.shape["b"]).astype("float32") - 0.5
-        self.np_dout = np.random.random(size=self.shape["dout"]).astype("float32") - 0.5
+        data_xwb = np.load(niuliling_path + ".npz")
+        data_dout = np.load(niuliling_path + ".npy")
+        self.np_x = data_xwb["x"]
+        self.np_w = data_xwb["weight"]
+        self.np_b = data_xwb["bias"]
+        print("shape x", self.np_x.shape)
+        print("shape w", self.np_w.shape)
+        print("shape b", self.np_b.shape)
+        self.np_dout = data_dout.astype("float32")

         # convert np array dtype
         if self.dtype == "float16":
@@ -122,10 +128,20 @@ def test_eager_accuracy(self):
         del out_grads_eager

 if __name__ == '__main__':
-    x = [[1, 8192, 14336], [1, 8192, 14336], [1, 8192, 1792], [1, 8192, 4816]]
-    w = [[14336, 5376], [14336, 9632], [1792, 14336], [4816, 14336]]
-    b = [[5376], [9632], [14336], [14336]]
-    dout = [[1, 8192, 5376], [1, 8192, 9632], [1, 8192, 14336], [1, 8192, 14336]]
+    if len(sys.argv) < 2:
+        print("Please provide the data_path argument")
+        sys.exit(1)
+
+    tmp = sys.argv[1]  # set the global data_path
+    niuliling_path = tmp
+    print(tmp)
+    del sys.argv[1]
+
+    x = [[1, 8192, 14336]]  # , [1, 8192, 14336], [1, 8192, 1792], [1, 8192, 4816]
+    w = [[14336, 5376]]  # , [14336, 9632], [1792, 14336], [4816, 14336]
+    b = [[5376]]  # , [9632], [14336], [14336]
+    dout = [[1, 8192, 5376]]  # , [1, 8192, 9632], [1, 8192, 14336], [1, 8192, 14336]
     shape_list = []
     for i in range(len(x)):
         shape = {}
@@ -134,13 +150,15 @@ def test_eager_accuracy(self):
         shape["b"] = b[i]
         shape["dout"] = dout[i]
         shape_list.append(shape)
+
     for test_mode in [1,2]:
         if test_mode == 1:
             atol = 1e-2
         elif test_mode == 2:
             atol = 1e-6
-        print("test_mode_{test_mode} start*************************************************************************" \
-            .format(test_mode=test_mode))
+        print(tmp, "test_mode", test_mode)
         for shape in shape_list:
             global_out.clear()
             global_dout.clear()
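All of the drivers in this patch share the same `__main__` plumbing: the case path arrives as `sys.argv[1]`, is stored in a module-level global, and is then deleted from `sys.argv` so that `unittest.main()` (which parses the remaining argv entries as test names) does not trip over it. A minimal sketch of that pattern, with hypothetical names:

```python
import sys
import unittest

data_path = None  # module-level global, read by the test cases

class PathAwareTest(unittest.TestCase):
    def test_path_is_set(self):
        self.assertIsNotNone(data_path)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Please provide the data_path argument")
        sys.exit(1)
    data_path = sys.argv[1]
    del sys.argv[1]  # hide the extra argument from unittest's CLI parser
    unittest.main()
```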
diff --git a/test_fp32_with_bf16/test_layernorm/case.txt b/test_fp32_with_bf16/test_layernorm/case.txt
new file mode 100644
index 0000000..51c2ed3
--- /dev/null
+++ b/test_fp32_with_bf16/test_layernorm/case.txt
@@ -0,0 +1,8 @@
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-0
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-1
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-2
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-3
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-4
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-5
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-6
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-7
diff --git a/test_fp32_with_bf16/test_layernorm/run.sh b/test_fp32_with_bf16/test_layernorm/run.sh
new file mode 100755
index 0000000..64f143e
--- /dev/null
+++ b/test_fp32_with_bf16/test_layernorm/run.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=1
+export NVIDIA_TF32_OVERRIDE=0
+export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
+rm -f new_log_test_layernorm_fp32vsbfp16
+# Read each case path from case.txt, one per line
+while IFS= read -r line
+do
+    # Run the test with the path as an argument and append output to the log
+    python test_layernorm_fp32vsbfp16.py "$line" 2>&1 | tee -a new_log_test_layernorm_fp32vsbfp16
+
+done < case.txt
diff --git a/test_fp32_with_bf16/test_layernorm/test_layernorm_fp32vsbfp16.py b/test_fp32_with_bf16/test_layernorm/test_layernorm_fp32vsbfp16.py
index d933f4a..cde167e 100644
--- a/test_fp32_with_bf16/test_layernorm/test_layernorm_fp32vsbfp16.py
+++ b/test_fp32_with_bf16/test_layernorm/test_layernorm_fp32vsbfp16.py
@@ -8,16 +8,19 @@
 from utils import TOLERANCE, convert_dtype_to_torch_type, np_assert_accuracy

+niuliling_path = None  # global data path; set from sys.argv in __main__
 class TestLayerNormFP32vsBFP16(unittest.TestCase):
     def setUp(self):
         self.init_np_inputs_and_dout()

     def init_np_inputs_and_dout(self):
         # init np array
-        self.np_x = np.random.random(size=[1, 8192, 14336]).astype("float32") - 0.5
-        self.np_w = np.random.random(size=[14336]).astype("float32") - 0.5
-        self.np_b = np.random.random(size=[14336]).astype("float32") - 0.5
-        self.np_dout = np.random.random(size=[1, 8192, 14336]).astype("float32") - 0.5
+        data_xwb = np.load(niuliling_path + ".npz")
+        data_dout = np.load(niuliling_path + ".npy")
+        self.np_x = data_xwb["x"].astype("float32")
+        self.np_w = data_xwb["weight"].astype("float32")
+        self.np_b = data_xwb["bias"].astype("float32")
+        self.np_dout = data_dout.astype("float32")

     def gen_eager_inputs_and_dout(self):
         x = paddle.to_tensor(
@@ -133,5 +136,14 @@ def test_layernorm_fp32vsbfp16_mode2(self):
         except Exception as e:
             print(e)
 if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print("Please provide the data_path argument")
+        sys.exit(1)
+
+    tmp = sys.argv[1]  # set the global data_path
+    niuliling_path = tmp
+    print(tmp)
+    del sys.argv[1]
+
     np.random.seed(2023)
-    unittest.main()
\ No newline at end of file
+    unittest.main()
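The only difference between mode1 and mode2 in these tests is that mode2 round-trips the fp32 reference gradients through bfloat16 (`paddle.cast(paddle.cast(x, "bfloat16"), "float32")`) before comparing, which is why it can use the much tighter tolerance (atol 1e-6 vs 1e-2 in the fused_linear driver): the round trip quantizes the reference to bf16-representable values, so the remaining difference measures kernel error rather than storage error. A small sketch of what the round trip does; the sample values are illustrative:

```python
import paddle

def bf16_round_trip(t):
    # quantize a float32 tensor to the nearest bfloat16-representable value
    return paddle.cast(paddle.cast(t, "bfloat16"), "float32")

x = paddle.to_tensor([1.0, 1.0 + 2.0 ** -10, 3.14159265], dtype="float32")
# bf16 keeps only 7 stored mantissa bits, so the middle value collapses to 1.0
print(bf16_round_trip(x).numpy())
```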
diff --git a/test_fp32_with_bf16/test_matmul/case.txt b/test_fp32_with_bf16/test_matmul/case.txt
new file mode 100644
index 0000000..ed44119
--- /dev/null
+++ b/test_fp32_with_bf16/test_matmul/case.txt
@@ -0,0 +1,8 @@
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_0.w_0-eager_tmp_101-pp-11-mp-0
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_1.w_0-eager_tmp_101-pp-11-mp-1
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_2.w_0-eager_tmp_101-pp-11-mp-2
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_3.w_0-eager_tmp_101-pp-11-mp-3
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_4.w_0-eager_tmp_101-pp-11-mp-4
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_5.w_0-eager_tmp_101-pp-11-mp-5
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_6.w_0-eager_tmp_101-pp-11-mp-6
+/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_7.w_0-eager_tmp_101-pp-11-mp-7
diff --git a/test_fp32_with_bf16/test_matmul/run.sh b/test_fp32_with_bf16/test_matmul/run.sh
new file mode 100755
index 0000000..9ecedaf
--- /dev/null
+++ b/test_fp32_with_bf16/test_matmul/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0
+export NVIDIA_TF32_OVERRIDE=0
+export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
+rm -f new_log_matmul_fp32vsbf16
+# Read each case path from case.txt, one per line
+while IFS= read -r line
+do
+    # Run the test with the path as an argument and append output to the log
+    python test_matmul_fp32vsbfp16.py "$line" 2>&1 | tee -a new_log_matmul_fp32vsbf16
+done < case.txt
diff --git a/test_fp32_with_bf16/test_matmul/test_matmul_fp32vsbfp16.py b/test_fp32_with_bf16/test_matmul/test_matmul_fp32vsbfp16.py
index 3a46d8d..9da66f1 100644
--- a/test_fp32_with_bf16/test_matmul/test_matmul_fp32vsbfp16.py
+++ b/test_fp32_with_bf16/test_matmul/test_matmul_fp32vsbfp16.py
@@ -25,6 +25,7 @@
 from utils import (
     np_assert_accuracy,
 )
+niuliling_path = None  # global data path; set from sys.argv in __main__

 class TestMatmulFP32vsBFP16(unittest.TestCase):
     def setUp(self):
@@ -32,9 +33,14 @@ def setUp(self):
     def init_np_inputs_and_dout(self):
         # init np array
-        self.np_x = np.random.random(size=[1, 8192, 14336]).astype("float32") - 0.5
-        self.np_y = np.random.random(size=[14336, 12528]).astype("float32") - 0.5
-        self.np_dout = np.random.random(size=[1, 8192, 12528]).astype("float32") - 0.5
+        data_xy = np.load(niuliling_path + ".npz")
+        data_dout = np.load(niuliling_path + ".npy")
+
+        self.np_x = data_xy["x"].astype("float32")
+        self.np_y = data_xy["y"].astype("float32")
+        print("shape x", self.np_x.shape)
+        print("shape y", self.np_y.shape)
+        self.np_dout = data_dout.astype("float32")

     def gen_eager_inputs_and_dout(self):
         x = paddle.to_tensor(
@@ -56,6 +62,7 @@ def gen_eager_inputs_and_dout(self):
         )
         dout.stop_gradient = False
         return x, y, dout
+
     def cal_res(self, x, y, dout):
         out = paddle.matmul(x, y)
         out_grads = paddle.grad([out], [x, y], grad_outputs=[dout])
@@ -143,5 +150,13 @@ def test_matmul_fp32vsbfp16_mode2(self):
             print(e)

 if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print("Please provide the data_path argument")
+        sys.exit(1)
+
+    tmp = sys.argv[1]  # set the global data_path
+    niuliling_path = tmp
+    print(tmp)
+    del sys.argv[1]
     np.random.seed(2023)
     unittest.main()
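Every path listed in the case.txt files refers to a pair of dump files sharing one stem: a `<case>.npz` archive holding the named forward inputs and a `<case>.npy` file holding the upstream gradient (`dout`), matching the `np.load(niuliling_path + ".npz")` / `np.load(niuliling_path + ".npy")` pattern used by all the tests above. A sketch of how such a pair could be produced, using the key names the matmul test loads; the path and shapes here are hypothetical:

```python
import numpy as np

case = "/tmp/matmul-demo-case"  # hypothetical stem; real ones come from case.txt
x = np.random.random(size=[1, 128, 256]).astype("float32") - 0.5
y = np.random.random(size=[256, 64]).astype("float32") - 0.5
dout = np.random.random(size=[1, 128, 64]).astype("float32") - 0.5

np.savez(case + ".npz", x=x, y=y)  # named inputs, read back as data_xy["x"], data_xy["y"]
np.save(case + ".npy", dout)       # upstream gradient, read back with np.load(case + ".npy")
```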