7 changes: 7 additions & 0 deletions test_fp32_with_bf16/run.sh
@@ -0,0 +1,7 @@
#!/bin/bash
for dir in test_flash_attention/ test_fused_linear/ test_layernorm/ test_matmul/ test_silu/ test_vocab_parallel_embedding/
do
cd $dir
./run.sh &
cd ..
done
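Note: each suite is launched in the background with '&', so this script returns before any suite finishes and the six logs interleave. A minimal variant, assuming each per-suite run.sh is self-contained, that runs every suite in a subshell and blocks until all of them complete:

#!/bin/bash
for dir in test_flash_attention/ test_fused_linear/ test_layernorm/ test_matmul/ test_silu/ test_vocab_parallel_embedding/
do
    (cd "$dir" && ./run.sh) &  # subshell keeps the parent working directory unchanged
done
wait  # block until every backgrounded suite has finished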
8 changes: 8 additions & 0 deletions test_fp32_with_bf16/test_flash_attention/case.txt
@@ -0,0 +1,8 @@
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-0
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-1
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-2
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-3
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-4
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-5
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-6
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fa-fp32-fp32-bf16-bf16-fp32-eager_tmp_111-eager_tmp_112-eager_tmp_113-eager_tmp_114-eager_tmp_115-pp-11-mp-7
12 changes: 12 additions & 0 deletions test_fp32_with_bf16/test_flash_attention/run.sh
@@ -0,0 +1,12 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES=1
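# NVIDIA_TF32_OVERRIDE=0 disables TF32 tensor-core math in cuBLAS/cuDNN, so the fp32 baseline runs in true IEEE fp32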
export NVIDIA_TF32_OVERRIDE=0
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
rm -f new_log_test_flash_attention_fp32vs_bfp16
# Read each case path line by line
while IFS= read -r line
do
# Invoke the test script with the case path as its argument
python test_flash_attention_fp32vs_bfp16.py "$line" 2>&1 | tee -a new_log_test_flash_attention_fp32vs_bfp16

done < case.txt
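Each line of case.txt is a path prefix: the test loads its inputs from '<prefix>.npz' and the upstream gradient from '<prefix>.npy'. A hypothetical generator for one flash-attention case; the key names (query/key/value) match what the test reads, and the shape is the one the old randomly-initialized test used, so it is illustrative only:

import numpy as np

prefix = "numpy_data/fa-example-case"  # hypothetical case prefix
shape = [1, 8192, 14, 128]             # [batch, seqlen, num_heads, head_dim]
q = (np.random.random(shape) - 0.5).astype("float32")
k = (np.random.random(shape) - 0.5).astype("float32")
v = (np.random.random(shape) - 0.5).astype("float32")
dout = (np.random.random(shape) - 0.5).astype("float32")

np.savez(prefix + ".npz", query=q, key=k, value=v)  # read back as data_xwb[...]
np.save(prefix + ".npy", dout)                      # read back as data_dout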
test_fp32_with_bf16/test_flash_attention/test_flash_attention_fp32vs_bfp16.py
@@ -28,6 +28,8 @@
convert_dtype_to_torch_type,
np_assert_accuracy
)

niuliling_path = None # global variable
def get_triangle_upper_mask(shape):
mask = paddle.full(shape=shape, fill_value=-np.inf)
mask.stop_gradient = True
@@ -56,26 +58,47 @@ def setUp(self):

def init_np_inputs_and_dout(self):
# init np array
-self.np_x = np.random.random(size=[1,8192,14,128]).astype("float32") - 0.5
-self.np_dout = np.random.random(size=[1,8192,14,128]).astype("float32") - 0.5
+data_xwb = np.load(niuliling_path + ".npz")
+data_dout = np.load(niuliling_path + ".npy")
+
+self.np_q = data_xwb["query"].astype("float32")
+self.np_k = data_xwb["key"].astype("float32")
+self.np_v = data_xwb["value"].astype("float32")
+
+print("shape q", self.np_q.shape)
+print("shape k", self.np_k.shape)
+print("shape v", self.np_v.shape)
+self.np_dout = data_dout.astype("float32")

def gen_eager_inputs_and_dout(self):
x_eager = paddle.to_tensor(
-self.np_x,
+self.np_q,
dtype="float32",
place="gpu",
)
x_eager.stop_gradient = False
k_eager = paddle.to_tensor(
self.np_k,
dtype="float32",
place="gpu",
)
k_eager.stop_gradient = False
v_eager = paddle.to_tensor(
self.np_v,
dtype="float32",
place="gpu",
)
v_eager.stop_gradient = False
dout_eager = paddle.to_tensor(
self.np_dout,
dtype="float32",
place="gpu",
)
dout_eager.stop_gradient = False
-return x_eager, dout_eager
-def cal_bfp16_res(self, x, dout):
-out, _ = flash_attention(x, x, x, 0.0, True, False)
-out_grads = paddle.grad([out], [x], grad_outputs=[dout])
+return x_eager, k_eager, v_eager, dout_eager
+def cal_bfp16_res(self, x, k, v, dout):
+out, _ = flash_attention(x, k, v, 0.0, True, False)
+out_grads = paddle.grad([out], [x, k, v], grad_outputs=[dout])
return out, out_grads

# def cal_fp32_res(self, x, dout):
@@ -87,16 +110,16 @@ def cal_bfp16_res(self, x, dout):
# out_grads = map_structure(lambda x: paddle.cast(x, dtype="float32"), out_grads)
# return out, out_grads

-def cal_fp32_res(self, x, dout):
-out = multi_head_attention(x, x, x)
-out_grads = paddle.grad([out], [x], grad_outputs=[dout])
+def cal_fp32_res(self, x, k, v, dout):
+out = multi_head_attention(x, k, v)
+out_grads = paddle.grad([out], [x, k, v], grad_outputs=[dout])
return out, out_grads

def test_flash_atten_fp32vsbfp16_mode1(self):
-x_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
-x_fp32, dout_fp32 = paddle.cast(x_bfp16, "float32"), paddle.cast(dout_bfp16, "float32")
-out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, dout_fp32)
-out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, dout_bfp16)
+x_bfp16, k_bfp16, v_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
+x_fp32, k_fp32, v_fp32, dout_fp32 = paddle.cast(x_bfp16, "float32"), paddle.cast(k_bfp16, "float32"), paddle.cast(v_bfp16, "float32"), paddle.cast(dout_bfp16, "float32")
+out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, k_fp32, v_fp32, dout_fp32)
+out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, k_bfp16, v_bfp16, dout_bfp16)
pt_out_bfp16 = paddle.cast(out_bfp16, "float32")
pt_out_grads_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="float32"), out_grads_bfp16)
try:
@@ -132,11 +155,11 @@ def test_flash_atten_fp32vsbfp16_mode1(self):
print(e)

def test_flash_atten_fp32vsbfp16_mode2(self):
-x_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
-x_fp32, dout_fp32 = paddle.cast(x_bfp16, "float32"), paddle.cast(dout_bfp16, "float32")
-out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, dout_fp32)
+x_bfp16, k_bfp16, v_bfp16, dout_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="bfloat16"), self.gen_eager_inputs_and_dout())
+x_fp32, k_fp32, v_fp32, dout_fp32 = paddle.cast(x_bfp16, "float32"), paddle.cast(k_bfp16, "float32"), paddle.cast(v_bfp16, "float32"), paddle.cast(dout_bfp16, "float32")
+out_fp32, out_grads_fp32 = self.cal_fp32_res(x_fp32, k_fp32, v_fp32, dout_fp32)
out_grads_fp32 = map_structure(lambda x: paddle.cast(paddle.cast(x,"bfloat16"),"float32"), out_grads_fp32)
-out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, dout_bfp16)
+out_bfp16, out_grads_bfp16 = self.cal_bfp16_res(x_bfp16, k_bfp16, v_bfp16, dout_bfp16)
pt_out_bfp16 = paddle.cast(out_bfp16, "float32")
pt_out_grads_bfp16 = map_structure(lambda x: paddle.cast(x, dtype="float32"), out_grads_bfp16)
try:
@@ -172,5 +195,14 @@ def test_flash_atten_fp32vsbfp16_mode2(self):
print(e)

if __name__ == '__main__':
if len(sys.argv) < 2:
print("Please provide a data_path argument")
sys.exit(1)

tmp = sys.argv[1] # set the global data_path
niuliling_path = tmp
print(tmp)

del sys.argv[1]
np.random.seed(2023)
unittest.main()
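The fp32 baseline calls multi_head_attention(x, k, v), whose definition sits outside the hunks shown. A plausible sketch, assuming it is a naive causal attention equivalent to flash_attention(q, k, v, 0.0, True, False); the get_triangle_upper_mask helper in the same file suggests exactly this upper-triangular -inf mask:

import numpy as np
import paddle

def naive_causal_attention(q, k, v):
    # q, k, v: [batch, seqlen, num_heads, head_dim], the layout flash_attention expects
    seqlen, head_dim = q.shape[1], q.shape[-1]
    qt = paddle.transpose(q, [0, 2, 1, 3])  # -> [batch, heads, seqlen, head_dim]
    kt = paddle.transpose(k, [0, 2, 1, 3])
    vt = paddle.transpose(v, [0, 2, 1, 3])
    scores = paddle.matmul(qt, kt, transpose_y=True) / np.sqrt(head_dim)
    mask = paddle.triu(paddle.full([seqlen, seqlen], -np.inf), diagonal=1)  # causal mask
    probs = paddle.nn.functional.softmax(scores + mask, axis=-1)
    out = paddle.matmul(probs, vt)
    return paddle.transpose(out, [0, 2, 1, 3])  # back to [batch, seqlen, heads, head_dim]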
8 changes: 8 additions & 0 deletions test_fp32_with_bf16/test_fused_linear/case.txt
@@ -0,0 +1,8 @@
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_0.w_0-server_nlg_mask_lm_out_fc_0.b_0-eager_in_tmp_11-pp-11-mp-0
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_1.w_0-server_nlg_mask_lm_out_fc_1.b_0-eager_in_tmp_11-pp-11-mp-1
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_2.w_0-server_nlg_mask_lm_out_fc_2.b_0-eager_in_tmp_11-pp-11-mp-2
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_3.w_0-server_nlg_mask_lm_out_fc_3.b_0-eager_in_tmp_11-pp-11-mp-3
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_4.w_0-server_nlg_mask_lm_out_fc_4.b_0-eager_in_tmp_11-pp-11-mp-4
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_5.w_0-server_nlg_mask_lm_out_fc_5.b_0-eager_in_tmp_11-pp-11-mp-5
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_6.w_0-server_nlg_mask_lm_out_fc_6.b_0-eager_in_tmp_11-pp-11-mp-6
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/fl-fp32-bf16-bf16-bf16-eager_in_tmp_9-server_nlg_mask_lm_out_fc_7.w_0-server_nlg_mask_lm_out_fc_7.b_0-eager_in_tmp_11-pp-11-mp-7
12 changes: 12 additions & 0 deletions test_fp32_with_bf16/test_fused_linear/run.sh
@@ -0,0 +1,12 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES=2
export NVIDIA_TF32_OVERRIDE=0
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
rm -f new_log_test_fused_linear_develop
# Read each case path line by line
while IFS= read -r line
do
# Invoke the test script with the case path as its argument
python test_fused_linear_develop.py "$line" 2>&1 | tee -a new_log_test_fused_linear_develop

done < case.txt
38 changes: 28 additions & 10 deletions test_fp32_with_bf16/test_fused_linear/test_fused_linear_develop.py
@@ -13,6 +13,7 @@

global_out = []
global_dout = []
niuliling_path = None # global variable

class TestFCDevelop(unittest.TestCase):
def __init__(self, shape, dtype, test_mode):
@@ -28,10 +29,15 @@ def init_threshold(self):

def init_np_inputs_and_dout(self):
np.random.seed(123)
-self.np_x = np.random.random(size=self.shape["x"]).astype("float32") - 0.5
-self.np_w = np.random.random(size=self.shape["w"]).astype("float32") - 0.5
-self.np_b = np.random.random(size=self.shape["b"]).astype("float32") - 0.5
-self.np_dout = np.random.random(size=self.shape["dout"]).astype("float32") - 0.5
+data_xwb = np.load(niuliling_path + ".npz")
+data_dout = np.load(niuliling_path + ".npy")
+self.np_x = data_xwb["x"]
+self.np_w = data_xwb["weight"]
+self.np_b = data_xwb["bias"]
+print("shape x", self.np_x.shape)
+print("shape w", self.np_w.shape)
+print("shape b", self.np_b.shape)
+self.np_dout = data_dout.astype("float32")

# convert np array dtype
if self.dtype == "float16":
@@ -122,10 +128,20 @@ def test_eager_accuracy(self):
del out_grads_eager

if __name__ == '__main__':
-x = [[1, 8192, 14336], [1, 8192, 14336], [1, 8192, 1792], [1, 8192, 4816]]
-w = [[14336, 5376], [14336, 9632], [1792, 14336], [4816, 14336]]
-b = [[5376], [9632], [14336], [14336]]
-dout = [[1, 8192, 5376], [1, 8192, 9632], [1, 8192, 14336], [1, 8192, 14336]]
+if len(sys.argv) < 2:
+print("Please provide a data_path argument")
+sys.exit(1)
+
+tmp = sys.argv[1] # set the global data_path
+niuliling_path = tmp
+print(tmp)
+del sys.argv[1]
+
+x = [[1, 8192, 14336]] #, [1, 8192, 14336], [1, 8192, 1792], [1, 8192, 4816]]
+w = [[14336, 5376]] #, [14336, 9632], [1792, 14336], [4816, 14336]]
+b = [[5376]] #, [9632], [14336], [14336]]
+dout = [[1, 8192, 5376]] #, [1, 8192, 9632], [1, 8192, 14336], [1, 8192, 14336]]
shape_list = []
for i in range(len(x)):
shape = {}
@@ -134,13 +150,15 @@ def test_eager_accuracy(self):
shape["b"] = b[i]
shape["dout"] = dout[i]
shape_list.append(shape)

for test_mode in [1,2]:
if test_mode == 1:
atol = 1e-2
elif test_mode == 2:
atol = 1e-6
print("test_mode_{test_mode} start*************************************************************************" \
.format(test_mode=test_mode))
#print("test_mode_{test_mode} start*************************************************************************" \
# .format(test_mode=test_mode))
print(tmp, "test_mode", test_mode)
for shape in shape_list:
global_out.clear()
global_dout.clear()
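Both test modes share one idea: mode 1 compares the bf16 results directly against the fp32 baseline with a loose atol (1e-2), while mode 2 first round-trips the fp32 gradients through bf16 (cast down and back up, as in the out_grads_fp32 map_structure above) and then compares with a tight atol (1e-6), separating bf16 arithmetic error from pure representation error. A minimal sketch of the round-trip:

import paddle

x = paddle.rand([4, 8], dtype="float32")
roundtrip = paddle.cast(paddle.cast(x, "bfloat16"), "float32")
# roundtrip is x snapped onto the bf16 grid (8-bit significand); comparing a bf16
# computation against roundtrip instead of x removes the representation error
print(float(paddle.max(paddle.abs(x - roundtrip))))  # on the order of |x| * 2^-8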
8 changes: 8 additions & 0 deletions test_fp32_with_bf16/test_layernorm/case.txt
@@ -0,0 +1,8 @@
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-0
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-1
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-2
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-3
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-4
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-5
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-6
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/ln-bf16-fp32-fp32-fp32-eager_in_tmp_0-encoder_layer_119_pre_att__layer_norm_scale_0.w_0-encoder_layer_119_pre_att_layer_norm_bias_0.b_0-eager_tmp_205-pp-11-mp-7
12 changes: 12 additions & 0 deletions test_fp32_with_bf16/test_layernorm/run.sh
@@ -0,0 +1,12 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES=1
export NVIDIA_TF32_OVERRIDE=0
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
rm -f new_log_test_layernorm_fp32vsbfp16
# Read each case path line by line
while IFS= read -r line
do
# Invoke the test script with the case path as its argument
python test_layernorm_fp32vsbfp16.py "$line" 2>&1 | tee -a new_log_test_layernorm_fp32vsbfp16

done < case.txt
22 changes: 17 additions & 5 deletions test_fp32_with_bf16/test_layernorm/test_layernorm_fp32vsbfp16.py
@@ -8,16 +8,19 @@
from utils import TOLERANCE, convert_dtype_to_torch_type, np_assert_accuracy


niuliling_path = None # global variable
class TestLayerNormFP32vsBFP16(unittest.TestCase):
def setUp(self):
self.init_np_inputs_and_dout()

def init_np_inputs_and_dout(self):
# init np array
-self.np_x = np.random.random(size=[1, 8192, 14336]).astype("float32") - 0.5
-self.np_w = np.random.random(size=[14336]).astype("float32") - 0.5
-self.np_b = np.random.random(size=[14336]).astype("float32") - 0.5
-self.np_dout = np.random.random(size=[1, 8192, 14336]).astype("float32") - 0.5
+data_xwb = np.load(niuliling_path + ".npz")
+data_dout = np.load(niuliling_path + ".npy")
+self.np_x = data_xwb["x"].astype("float32")
+self.np_w = data_xwb["weight"].astype("float32")
+self.np_b = data_xwb["bias"].astype("float32")
+self.np_dout = data_dout.astype("float32")

def gen_eager_inputs_and_dout(self):
x = paddle.to_tensor(
@@ -133,5 +136,14 @@ def test_layernorm_fp32vsbfp16_mode2(self):
except Exception as e:
print(e)
if __name__ == '__main__':
if len(sys.argv) < 2:
print("Please provide a data_path argument")
sys.exit(1)

tmp = sys.argv[1] # set the global data_path
niuliling_path = tmp
print(tmp)
del sys.argv[1]

np.random.seed(2023)
unittest.main()
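The layer-norm op itself is outside the hunks shown; given x of shape [1, 8192, 14336] with weight and bias of shape [14336], it presumably normalizes over the last dimension. A minimal sketch under that assumption, with toy shapes:

import paddle
import paddle.nn.functional as F

x = paddle.rand([2, 16, 64], dtype="float32")  # real cases use [1, 8192, 14336]
w = paddle.rand([64], dtype="float32")
b = paddle.rand([64], dtype="float32")
out = F.layer_norm(x, x.shape[-1:], weight=w, bias=b, epsilon=1e-5)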
8 changes: 8 additions & 0 deletions test_fp32_with_bf16/test_matmul/case.txt
@@ -0,0 +1,8 @@
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_0.w_0-eager_tmp_101-pp-11-mp-0
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_1.w_0-eager_tmp_101-pp-11-mp-1
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_2.w_0-eager_tmp_101-pp-11-mp-2
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_3.w_0-eager_tmp_101-pp-11-mp-3
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_4.w_0-eager_tmp_101-pp-11-mp-4
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_5.w_0-eager_tmp_101-pp-11-mp-5
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_6.w_0-eager_tmp_101-pp-11-mp-6
/home/niuliling/more/PaddleApiTest/test_fp32_with_bf16/numpy_data/matmul-bf16-bf16-bf16-eager_tmp_100-encoder_layer_130_multi_head_att_output_fc_7.w_0-eager_tmp_101-pp-11-mp-7
11 changes: 11 additions & 0 deletions test_fp32_with_bf16/test_matmul/run.sh
@@ -0,0 +1,11 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0
export NVIDIA_TF32_OVERRIDE=0
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
rm -f new_log_matmul_fp32vsbf16
# Read each case path line by line
while IFS= read -r line
do
# Invoke the test script with the case path as its argument
python test_matmul_fp32vsbfp16.py "$line" 2>&1 | tee -a new_log_matmul_fp32vsbf16
done < case.txt
21 changes: 18 additions & 3 deletions test_fp32_with_bf16/test_matmul/test_matmul_fp32vsbfp16.py
@@ -25,16 +25,22 @@
from utils import (
np_assert_accuracy,
)
niuliling_path = None # global variable

class TestMatmulFP32vsBFP16(unittest.TestCase):
def setUp(self):
self.init_np_inputs_and_dout()

def init_np_inputs_and_dout(self):
# init np array
-self.np_x = np.random.random(size=[1, 8192, 14336]).astype("float32") - 0.5
-self.np_y = np.random.random(size=[14336, 12528]).astype("float32") - 0.5
-self.np_dout = np.random.random(size=[1, 8192, 12528]).astype("float32") - 0.5
+data_xy = np.load(niuliling_path + ".npz")
+data_dout = np.load(niuliling_path + ".npy")
+
+self.np_x = data_xy["x"].astype("float32")
+self.np_y = data_xy["y"].astype("float32")
+print("shape x", self.np_x.shape)
+print("shape y", self.np_y.shape)
+self.np_dout = data_dout.astype("float32")

def gen_eager_inputs_and_dout(self):
x = paddle.to_tensor(
@@ -56,6 +62,7 @@ def gen_eager_inputs_and_dout(self):
)
dout.stop_gradient = False
return x, y, dout

def cal_res(self, x, y, dout):
out = paddle.matmul(x, y)
out_grads = paddle.grad([out], [x, y], grad_outputs=[dout])
@@ -143,5 +150,13 @@ def test_matmul_fp32vsbfp16_mode2(self):
print(e)

if __name__ == '__main__':
if len(sys.argv) < 2:
print("Please provide a data_path argument")
sys.exit(1)

tmp = sys.argv[1] # set the global data_path
niuliling_path = tmp
print(tmp)
del sys.argv[1]
np.random.seed(2023)
unittest.main()
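For reference, the gradients returned by paddle.grad([out], [x, y], grad_outputs=[dout]) for out = paddle.matmul(x, y) are the standard matmul adjoints. A tiny numpy check with toy shapes (the real cases use x:[1, 8192, 14336] and y:[14336, 12528]):

import numpy as np

x = np.random.random([1, 8, 16]).astype("float32")
y = np.random.random([16, 12]).astype("float32")
dout = np.random.random([1, 8, 12]).astype("float32")

out = x @ y
dx = dout @ y.T                          # same shape as x
dy = np.einsum("bik,bij->kj", x, dout)   # y has no batch dim, so reduce the broadcast batch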