From b60fa0491709540abd54417181d23869859147f6 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 7 Mar 2022 13:36:17 -0500 Subject: [PATCH 1/4] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 314 +++++++++++++++++++++++++++--------- 1 file changed, 241 insertions(+), 73 deletions(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index c7ac5d3..e410bdb 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -26,6 +26,9 @@ Author: Peter Boyle See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + + *************************************************************************************/ + /* END LEGAL */ #include #include using namespace std; @@ -33,16 +36,21 @@ using namespace Grid; #ifdef OMPTARGET_UVM #pragma omp requires unified_shared_memory #endif + +//#define TILE_UNROLL +#define UNROLL_TILE +#define TILE_SIZE 64 +#define UNROLL_FACTOR 16 int main (int argc, char ** argv) { Grid_init(&argc,&argv); #define LMAX (48) -#define LMIN (8) -#define LADD (8) +#define LMIN (4) +#define LADD (4) int64_t Nwarm=50; int64_t Nloop=1000; - + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); std::cout<= vol) { - std::cout<<"Spot check failed; index out of bound"< latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); @@ -280,17 +449,16 @@ int64_t Nloop=1000; LatticeColourMatrix y(&Grid); random(pRNG,y); for(int mu=0;mu<4;mu++){ - double start=usecond(); - for(int64_t i=0;i Date: Mon, 7 Mar 2022 13:38:44 -0500 Subject: [PATCH 2/4] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index e410bdb..f334b9e 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -202,7 +202,12 @@ int64_t Nloop=1000; for(int64_t s = ss+ 13*TILE_SIZE; s < ss+14*TILE_SIZE; s++) zv[s]=xv[s]*yv[s]; - + + for(int64_t s = ss+ 14*TILE_SIZE; s < ss+15*TILE_SIZE; s++) + zv[s]=xv[s]*yv[s]; + for(int64_t s = ss+ 15*TILE_SIZE; s < ss+16*TILE_SIZE; s++) + zv[s]=xv[s]*yv[s]; + } #else @@ -211,8 +216,6 @@ int64_t Nloop=1000; for(int64_t ss=0; ss Date: Fri, 18 Mar 2022 16:05:24 -0400 Subject: [PATCH 3/4] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 156 ++---------------------------------- 1 file changed, 7 insertions(+), 149 deletions(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index f334b9e..6731c39 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -160,80 +160,15 @@ int64_t Nloop=1000; #ifdef UNROLL_TILE #pragma omp target teams distribute parallel for thread_limit(gpu_threads) unroll partial(UNROLL_FACTOR) tile sizes(TILE_SIZE) - for(int64_t ss=0;ss Date: Tue, 26 Apr 2022 23:43:21 -0400 Subject: [PATCH 4/4] Adding OpenMP multi-GPU parallelization for OpenMP offload Adding OpenMP multi-GPU parallelization for OpenMP offload in the version without function overloading of the star operator. --- benchmarks/Benchmark_su3.cc | 52 +++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index 6731c39..18aa1e3 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -155,42 +155,60 @@ int64_t Nloop=1000; map(to:xv._odata[ :xv.size()]) \ map(to:yv._odata[ :yv.size()]) - for(int64_t i=0;i