A set of Blackwell kernels in CUDA and CuTe
# BF16 input with FP32 accumulation kernel
nvcc -arch=sm_120a -O3 -std=c++17 -o bf16_fp32_kernel mma_m16n8k16_bf16_fp32_accum.cu
# FP16 input with FP16 accumulation kernel
nvcc -arch=sm_120a -O3 -std=c++17 -o fp16_fp16_kernel mma_m16n8k16_fp16_f16_accum.cu
# FP16 input with FP32 accumulation kernel
nvcc -arch=sm_120a -O3 -std=c++17 -o fp16_fp32_kernel mma_m16n8k16_fp16_f32_accum.cu

# Profile BF16 input with FP32 accumulation kernel
ncu --set full --call-stack --nvtx --import-source yes -o bf16_fp32_profile -f ./bf16_fp32_kernel
# Profile FP16 input with FP16 accumulation kernel
ncu --set full --call-stack --nvtx --import-source yes -o fp16_fp16_profile -f ./fp16_fp16_kernel
# Profile FP16 input with FP32 accumulation kernel
ncu --set full --call-stack --nvtx --import-source yes -o fp16_fp32_profile -f ./fp16_fp32_kernel