From bf2e9879c0174c2e76e2d07321c8f8a4ccab8e9a Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 27 Jan 2022 11:25:18 -0500 Subject: [PATCH 1/8] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index 3eaaab2..0feeea3 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -132,8 +132,33 @@ int main (int argc, char ** argv) for(int64_t i=0;i Date: Thu, 27 Jan 2022 11:26:25 -0500 Subject: [PATCH 2/8] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index 0feeea3..a4d617e 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -31,6 +31,15 @@ Author: Peter Boyle using namespace std; using namespace Grid; + +#define TILE_SZ 4 + +#define UNROLL_FACTOR 2 + +#define TILE +#define UNROLL +//#define OMP_TILE +//#define OMP_UNROLL int main (int argc, char ** argv) { Grid_init(&argc,&argv); From cc757c4e5de1df8ea04acb5d3ad9e237128a9229 Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Thu, 27 Jan 2022 11:30:00 -0500 Subject: [PATCH 3/8] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index a4d617e..513f057 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -150,7 +150,7 @@ int main (int argc, char ** argv) } #endif #ifdef OMP_UNROLL - #pragma omp unroll (UNROLL_FACTOR) + #pragma omp unroll factor(UNROLL_FACTOR) for(int64_t s=0;s Date: Thu, 27 Jan 2022 11:43:26 -0500 Subject: [PATCH 4/8] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index 513f057..c0ce205 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -150,7 +150,7 @@ int main (int argc, char ** argv) } #endif #ifdef OMP_UNROLL - #pragma omp unroll factor(UNROLL_FACTOR) + #pragma omp unroll patial(UNROLL_FACTOR) for(int64_t s=0;s Date: Thu, 27 Jan 2022 11:48:49 -0500 Subject: [PATCH 5/8] Update Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6d7ee28..b957851 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ MAIN=Benchmark_su3 ##Clang CXX=clang++ -CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart +CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -fno-exceptions -march=native -fopenmp-version=51 -fno-unroll-loops -fno-vectorize -llvm_info -Rpass=loop-unroll CXXFLAGS += -DOMPTARGET CXXFLAGS +=-DOMPTARGET_MANAGED #CXXFLAGS += -DVECTOR_LOOPS From 149820d16eafd4ec92b4708cf6f411362583cd4d Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Fri, 28 Jan 2022 19:18:16 -0500 Subject: [PATCH 6/8] Update Benchmark_su3.cc --- benchmarks/Benchmark_su3.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc index c0ce205..f932f0f 100644 --- a/benchmarks/Benchmark_su3.cc +++ b/benchmarks/Benchmark_su3.cc @@ -150,7 +150,7 @@ int main (int argc, char ** argv) } #endif #ifdef OMP_UNROLL - #pragma omp unroll patial(UNROLL_FACTOR) + #pragma omp unroll partial(UNROLL_FACTOR) for(int64_t s=0;s Date: Fri, 28 Jan 2022 19:19:12 -0500 Subject: [PATCH 7/8] Update Makefile --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index b957851..e77bb49 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ MAIN=Benchmark_su3 #CXX=pgc++ #CXXFLAGS=-fast --c++14 -acc -Mnollvm -Minfo=accel -ta=tesla:cc70,managed -Mlarge_arrays --no_exceptions +## Add flags for forcing no compiler-automated loop unroll ##Clang CXX=clang++ CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -fno-exceptions -march=native -fopenmp-version=51 -fno-unroll-loops -fno-vectorize -llvm_info -Rpass=loop-unroll From d6fc64d10424eb8c815a4ad8f670244651b3ccbe Mon Sep 17 00:00:00 2001 From: Vivek Kale <11766050+vlkale@users.noreply.github.com> Date: Mon, 14 Feb 2022 14:15:55 -0500 Subject: [PATCH 8/8] Adding OpenMP unroll factor and omp unroll directive --- Grid/threads/Pragmas.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h index 8a9b147..652805f 100644 --- a/Grid/threads/Pragmas.h +++ b/Grid/threads/Pragmas.h @@ -36,6 +36,7 @@ Author: paboyle #define strong_inline __attribute__((always_inline)) inline #define UNROLL _Pragma("unroll") +#define OMP_UROLL_FACT 4 ////////////////////////////////////////////////////////////////////////////////// // New primitives; explicit host thread calls, and accelerator data parallel calls ////////////////////////////////////////////////////////////////////////////////// @@ -132,13 +133,15 @@ extern uint32_t gpu_threads; #define accelerator_for(iterator,num,nsimd, ... ) \ { \ uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \ - _Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \ + uint32_t unroll_factor = OMP_UROLL_FACT; + _Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads) unroll partial(unroll_factor)") \ naked_for(iterator, num, { __VA_ARGS__ }); \ } #define accelerator_forNB(iterator,num,nsimd, ... ) \ { \ uint32_t nteams=(num+gpu_threads-1)/gpu_threads; \ - _Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \ + uint32_t unroll_factor = OMP_UROLL_FACT; + _Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads) unroll partial(unroll_factor)") \ naked_for(iterator, num, { __VA_ARGS__ }); \ }