From bf2e9879c0174c2e76e2d07321c8f8a4ccab8e9a Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Thu, 27 Jan 2022 11:25:18 -0500
Subject: [PATCH 1/8] Benchmark_su3: add manual and OpenMP unroll/tile loop variants

---
 benchmarks/Benchmark_su3.cc | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 3eaaab2..0feeea3 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -132,8 +132,39 @@ int main (int argc, char ** argv)
 
       for(int64_t i=0;i<Nloop;i++){
       #pragma omp target teams distribute parallel for
-	for(int64_t s=0;s<vol;s++){
-          zv[s]=xv[s]*yv[s];
+	 // The directive above binds to the first loop that survives preprocessing,
+	 // so enable exactly one of UNROLL / OMP_UNROLL / TILE / OMP_TILE.
+#ifdef UNROLL
+	 for(int64_t s=0;s<vol;s+=UNROLL_FACTOR) {
+		 // Hand-unrolled by two; the guard keeps the last site in range when vol is odd.
+		 static_assert(UNROLL_FACTOR==2,"manual unroll body is written for UNROLL_FACTOR==2");
+		 zv[s]=xv[s]*yv[s];
+		 if(s+1<vol) zv[s+1]=xv[s+1]*yv[s+1];
+	 }
+#endif
+#ifdef OMP_UNROLL
+	  #pragma omp unroll (UNROLL_FACTOR)
+	 for(int64_t s=0;s<vol;s++)
+		 zv[s]=xv[s]*yv[s];
+#endif
+
+
+#ifdef TILE
+	 for(int64_t s=0;s<vol;s+=TILE_SZ) {
+          for (int64_t t = s; t< min(s+TILE_SZ, vol); t++)
+             zv[t]=xv[t]*yv[t];
+	 }
+#endif
+
+#ifdef OMP_TILE
+	  #pragma omp tile sizes(TILE_SZ)
+	 for(int64_t s=0;s<vol;s++)
+		 zv[s]=xv[s]*yv[s];
+#endif
+#if !defined(UNROLL) && !defined(OMP_UNROLL) && !defined(TILE) && !defined(OMP_TILE)
+	 // Baseline: the original untransformed loop when no variant is selected.
+	 for(int64_t s=0;s<vol;s++)
+		 zv[s]=xv[s]*yv[s];
+#endif
-        }
       }
 

From 70881b769108e22f899f8a2925801da351e29044 Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Thu, 27 Jan 2022 11:26:25 -0500
Subject: [PATCH 2/8] Benchmark_su3: define TILE_SZ, UNROLL_FACTOR and variant switches

---
 benchmarks/Benchmark_su3.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 0feeea3..a4d617e 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -31,6 +31,15 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 using namespace std;
 using namespace Grid;
 
+// Knobs for the loop-variant experiment in main(): TILE_SZ is the tile width used by TILE / OMP_TILE.
+#define TILE_SZ 4
+// UNROLL_FACTOR is the stride of the UNROLL loop and the argument given to "omp unroll".
+#define UNROLL_FACTOR 2
+// NOTE(review): TILE and UNROLL are both on, so two loops follow the single omp target pragma in main(); Grid/threads/Pragmas.h also defines UNROLL - confirm both are intended.
+#define TILE
+#define UNROLL
+//#define OMP_TILE
+//#define OMP_UNROLL
 int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);

From cc757c4e5de1df8ea04acb5d3ad9e237128a9229 Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Thu, 27 Jan 2022 11:30:00 -0500
Subject: [PATCH 3/8] Benchmark_su3: pass the unroll count through a factor() clause

---
 benchmarks/Benchmark_su3.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index a4d617e..513f057 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -150,7 +150,7 @@ int main (int argc, char ** argv)
 	 }
 #endif
 #ifdef OMP_UNROLL
-	  #pragma omp unroll (UNROLL_FACTOR)
+	  #pragma omp unroll factor(UNROLL_FACTOR)
 	 for(int64_t s=0;s<vol;s++)
 		 zv[s]=xv[s]*yv[s];
 #endif

From 42d640f53893218eaa3d3cdea4d8629739bf7875 Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Thu, 27 Jan 2022 11:43:26 -0500
Subject: [PATCH 4/8] Benchmark_su3: replace factor() with partial() on omp unroll (misspelled "patial")

---
 benchmarks/Benchmark_su3.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index 513f057..c0ce205 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -150,7 +150,7 @@ int main (int argc, char ** argv)
 	 }
 #endif
 #ifdef OMP_UNROLL
-	  #pragma omp unroll factor(UNROLL_FACTOR)
+	  #pragma omp unroll patial(UNROLL_FACTOR)
 	 for(int64_t s=0;s<vol;s++)
 		 zv[s]=xv[s]*yv[s];
 #endif

From 16849ccba5fc259ba9ed65c22e784c1371e61177 Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Thu, 27 Jan 2022 11:48:49 -0500
Subject: [PATCH 5/8] Makefile: request OpenMP 5.1, disable auto unroll/vectorize, add unroll remarks

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 6d7ee28..b957851 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,7 @@ MAIN=Benchmark_su3
 
 ##Clang
 CXX=clang++
-CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode  -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart
+CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode  -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -fno-exceptions -march=native -fopenmp-version=51 -fno-unroll-loops -fno-vectorize -llvm_info -Rpass=loop-unroll
 CXXFLAGS += -DOMPTARGET 
 CXXFLAGS +=-DOMPTARGET_MANAGED
 #CXXFLAGS += -DVECTOR_LOOPS

From 149820d16eafd4ec92b4708cf6f411362583cd4d Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Fri, 28 Jan 2022 19:18:16 -0500
Subject: [PATCH 6/8] Benchmark_su3: fix omp unroll clause spelling (patial -> partial)

---
 benchmarks/Benchmark_su3.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Benchmark_su3.cc b/benchmarks/Benchmark_su3.cc
index c0ce205..f932f0f 100644
--- a/benchmarks/Benchmark_su3.cc
+++ b/benchmarks/Benchmark_su3.cc
@@ -150,7 +150,7 @@ int main (int argc, char ** argv)
 	 }
 #endif
 #ifdef OMP_UNROLL
-	  #pragma omp unroll patial(UNROLL_FACTOR)
+	  #pragma omp unroll partial(UNROLL_FACTOR)
 	 for(int64_t s=0;s<vol;s++)
 		 zv[s]=xv[s]*yv[s];
 #endif

From da254dd7c7f7cdacf94b08e0c60ef1818ecb69f6 Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Fri, 28 Jan 2022 19:19:12 -0500
Subject: [PATCH 7/8] Makefile: note the flags that disable automatic loop unrolling

---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index b957851..e77bb49 100644
--- a/Makefile
+++ b/Makefile
@@ -11,6 +11,7 @@ MAIN=Benchmark_su3
 #CXX=pgc++
 #CXXFLAGS=-fast --c++14 -acc -Mnollvm -Minfo=accel -ta=tesla:cc70,managed -Mlarge_arrays --no_exceptions
 
+## The Clang flags below include -fno-unroll-loops to disable compiler-automated loop unrolling
 ##Clang
 CXX=clang++
 CXXFLAGS=-std=c++14 -g -fopenmp -fopenmp-cuda-mode  -O3 -fopenmp-targets=nvptx64-nvidia-cuda -lcudart -fno-exceptions -march=native -fopenmp-version=51 -fno-unroll-loops -fno-vectorize -llvm_info -Rpass=loop-unroll

From d6fc64d10424eb8c815a4ad8f670244651b3ccbe Mon Sep 17 00:00:00 2001
From: Vivek Kale <11766050+vlkale@users.noreply.github.com>
Date: Mon, 14 Feb 2022 14:15:55 -0500
Subject: [PATCH 8/8] Adding OpenMP unroll factor and omp unroll directive

---
 Grid/threads/Pragmas.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Grid/threads/Pragmas.h b/Grid/threads/Pragmas.h
index 8a9b147..652805f 100644
--- a/Grid/threads/Pragmas.h
+++ b/Grid/threads/Pragmas.h
@@ -36,6 +36,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define strong_inline     __attribute__((always_inline)) inline
 #define UNROLL  _Pragma("unroll")
 
+// Partial-unroll factor for the offloaded loops in accelerator_for / accelerator_forNB.
+// "unroll" is a separate OpenMP directive rather than a clause of the target construct,
+// and partial() takes a compile-time constant, so the factor is pasted into its own _Pragma.
+#define OMP_UNROLL_FACT 4
+#define GRID_OMP_PRAGMA(x) _Pragma(#x)
+#define OMP_UNROLL_PARTIAL(n) GRID_OMP_PRAGMA(omp unroll partial(n))
 //////////////////////////////////////////////////////////////////////////////////
 // New primitives; explicit host thread calls, and accelerator data parallel calls
 //////////////////////////////////////////////////////////////////////////////////
@@ -132,13 +138,15 @@ extern uint32_t gpu_threads;
 #define accelerator_for(iterator,num,nsimd, ... )  \
 {                                                  \
 	uint32_t nteams=(num+gpu_threads-1)/gpu_threads;  \
        	_Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
+	OMP_UNROLL_PARTIAL(OMP_UNROLL_FACT) \
 	naked_for(iterator, num, { __VA_ARGS__ }); \
   }
 #define accelerator_forNB(iterator,num,nsimd, ... ) \
   {						    \
   	uint32_t nteams=(num+gpu_threads-1)/gpu_threads;  \
         _Pragma("omp target teams distribute parallel for num_teams(nteams) thread_limit(gpu_threads)") \
+	OMP_UNROLL_PARTIAL(OMP_UNROLL_FACT) \
         naked_for(iterator, num, { __VA_ARGS__ }); \
   }