From 1c4744b1ce59edeae52f2555fedad83eb0665ccc Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Mon, 20 Oct 2025 16:13:56 -0600 Subject: [PATCH 01/23] [kitsune] Add print-before-first command line option Since Tapir currently requires optimizations, it is difficult to get the IR as soon as it is generated by the frontend. Using -S -emit-llvm will only produce IR that has been through the entire optimization pipeline. This commit adds a -print-before-first option to both clang and flang. This prints the IR to stderr just before the optimization passes are run. This is useful when debugging the frontend to see exactly what IR is being produced. --- .../clang/Basic/DiagnosticDriverKinds.td | 5 ++- clang/include/clang/Driver/Options.td | 15 +++++-- clang/lib/CodeGen/BackendUtil.cpp | 4 ++ clang/lib/Driver/Driver.cpp | 6 ++- clang/lib/Driver/KitsuneOptionUtils.cpp | 1 + clang/lib/Driver/ToolChain.cpp | 1 + clang/lib/Frontend/CompilerInvocation.cpp | 3 ++ flang/lib/Frontend/FrontendActions.cpp | 3 ++ .../include/kitsune/Frontend/KitsuneOptions.h | 14 +++++++ kitsune/test/driver/print-before-first.c | 40 +++++++++++++++++++ kitsune/test/driver/print-before-first.f90 | 39 ++++++++++++++++++ 11 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 kitsune/test/driver/print-before-first.c create mode 100644 kitsune/test/driver/print-before-first.f90 diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 29ff8dbe1f12f..80d80e9699a00 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -901,6 +901,9 @@ def err_drv_unsupported_option_argument_for_frontend : Error< def err_drv_kitsune_missing_required : Error< "missing required option '--%0'">; +def err_drv_kitsune_tapir_required : Error< + "--tapir is required with '%0'">; + def err_drv_kitsune_optzns_required : Error< "--tapir requires optimization level O1 or higher">; def 
err_drv_kitsune_lto_o2_required : Error< @@ -944,8 +947,6 @@ def err_drv_opencilk_missing_abi_bitcode: Error< def err_drv_kitsune_kokkos_disabled : Error< "kokkos support was not enabled when kitsune was built">; -def err_drv_kitsune_kokkos_no_tapir : Error< - "--tapir is required with '%0'">; def err_drv_kitsune_target_not_enabled: Error< "tapir target '%0' was not enabled when kitsune was built">; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f71de4f5bfa4d..6d958332755ed 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4283,7 +4283,7 @@ def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group, HelpText<"Do not process trigraph sequences">, Visibility<[ClangOption, CC1Option]>; def funique_source_file_names: Flag<["-"], "funique-source-file-names">, Group, - HelpText<"Allow the compiler to assume that each translation unit has a unique " + HelpText<"Allow the compiler to assume that each translation unit has a unique " "source file identifier (see -funique-source-file-identifier) at link time">; def fno_unique_source_file_names: Flag<["-"], "fno-unique-source-file-names">; def unique_source_file_identifier_EQ: Joined<["-"], "funique-source-file-identifier=">, Group, @@ -7084,7 +7084,7 @@ defm android_pad_segment : BooleanFFlag<"android-pad-segment">, Group; def shared_libflangrt : Flag<["-"], "shared-libflangrt">, HelpText<"Link the flang-rt shared library">, Group, Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>; -def static_libflangrt : Flag<["-"], "static-libflangrt">, +def static_libflangrt : Flag<["-"], "static-libflangrt">, HelpText<"Link the flang-rt static library">, Group, Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>; @@ -9508,6 +9508,15 @@ def fkokkos_no_init : Flag<["-"], "fkokkos-no-init">, Alias, Visibility<[ClangOption, CC1Option]>, HelpText<"DEPRECATED: Use --kokkos-no-init">; +// Tapir currently requires at least -O1. 
This makes it difficult to write tests +// that check that the frontend generates the expected LLVM IR. If this +// restriction is ever removed, this option will no longer be necessary since +// we could just pass -O0 to examine the IR generated by the frontend. +def print_before_first: Joined<["--"], "print-before-first">, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, + HelpText<"Print the LLVM Module to stderr before running the optimization " + "pipeline">; + def kitrt_verbose: Joined<["--"], "kitrt-verbose">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Enable verbose mode in kitsune's runtime">; @@ -9515,7 +9524,7 @@ def kitrt_verbose: Joined<["--"], "kitrt-verbose">, def tapir_EQ : Joined<["--"], "tapir=">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, MetaVarName<"">, - Values<"none,serial,cuda,hip,opencilk">, + Values<"none,serial,cuda,hip,opencilk">, HelpText<"The primary tapir target">; def ftapir_EQ : Joined<["-"], "ftapir=">, Alias, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 0a82af6fbbbe6..03fb8a71a6746 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1216,6 +1216,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( return; } + const KitsuneOptions& kitOpts = CI.getKitsuneOpts(); + if (kitOpts.hasTTID() && kitOpts.getPrintBeforeFirst()) + llvm::errs() << *TheModule << "\n"; + // Now that we have all of the passes ready, run them. { PrettyStackTraceString CrashInfo("Optimizer"); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 5381edbe61a3d..d861e8474050d 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -229,13 +229,17 @@ static void CheckKitsuneOptions(const Driver &D, const ArgList &Args, // If --kokkos is provided, then a tapir target must also be provided.
if (!Args.hasArg(options::OPT_tapir_EQ)) { - D.Diag(diag::err_drv_kitsune_kokkos_no_tapir) + D.Diag(diag::err_drv_kitsune_tapir_required) << Args.getLastArg(options::OPT_kokkos, options::OPT_kokkos_no_init) ->getSpelling(); return; } } + if (const Arg *A = Args.getLastArg(options::OPT_print_before_first)) + if (!Args.hasArg(options::OPT_tapir_EQ)) + D.Diag(diag::err_drv_kitsune_tapir_required) << A->getSpelling(); + // Check that the -ftapir flag has a valid value. This stops us from // reporting multiple errors because the flag is examined in several places. if (const Arg *A = Args.getLastArg(options::OPT_tapir_EQ)) { diff --git a/clang/lib/Driver/KitsuneOptionUtils.cpp b/clang/lib/Driver/KitsuneOptionUtils.cpp index a288bdf7f7477..56695829646b3 100644 --- a/clang/lib/Driver/KitsuneOptionUtils.cpp +++ b/clang/lib/Driver/KitsuneOptionUtils.cpp @@ -335,6 +335,7 @@ bool clang::parseKitsuneArgs(KitsuneOptions &kitOpts, const char *argv0, kitOpts.setKitsuneFrontend(IsKitsuneFrontend(argv0)); kitOpts.setStripmineLoops(args.hasArg(OPT_fstripmine)); + kitOpts.setPrintBeforeFirst(args.hasArg(OPT_print_before_first)); kitOpts.setTapirVerbose(args.hasArg(OPT_tapir_verbose)); kitOpts.setKitrtVerbose(args.hasArg(OPT_kitrt_verbose)); diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 9f7a3a4a83e23..a953a300a8309 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -2385,6 +2385,7 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, if (std::optional TT = parseTapirTargetIfValid(Args)) { Args.AddLastArg(CmdArgs, options::OPT_ffp_contract); + Args.AddLastArg(CmdArgs, options::OPT_print_before_first); Args.AddLastArg(CmdArgs, options::OPT_kitrt_verbose); Args.AddLastArg(CmdArgs, options::OPT_tapir_verbose); Args.AddLastArg(CmdArgs, options::OPT_tapir_EQ); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index c9309fbebae6f..db8c73df8b1ce 100644 --- 
a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4849,6 +4849,9 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts, if (Opts.getStripmineLoops()) GenerateArg(Consumer, OPT_fstripmine); + + if (Opts.getPrintBeforeFirst()) + GenerateArg(Consumer, OPT_print_before_first); } bool CompilerInvocation::CheckKitsuneArgs(const ArgList &Args, diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 51a159697a0a1..bc93989d472ca 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -1053,6 +1053,9 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { return; } + if (kitsuneOpts.hasTTID() && kitsuneOpts.getPrintBeforeFirst()) + llvm::errs() << *llvmModule << "\n"; + // Run the passes. mpm.run(*llvmModule, mam); diff --git a/kitsune/include/kitsune/Frontend/KitsuneOptions.h b/kitsune/include/kitsune/Frontend/KitsuneOptions.h index a5eafaa4d9090..81c877cade54e 100644 --- a/kitsune/include/kitsune/Frontend/KitsuneOptions.h +++ b/kitsune/include/kitsune/Frontend/KitsuneOptions.h @@ -90,6 +90,14 @@ class KitsuneOptions { /// Should loop strip-mining be enabled. unsigned stripmineLoops : 1; + /// Should the LLVM Module be printed to stderr just before running the + /// optimization passes. Tapir currently requires at least -O1. This makes it + /// difficult to write tests that check that the frontend generates the + /// expected LLVM IR. If this restriction is ever removed, this option will no + /// longer be necessary since we could just pass -O0 to examine the IR + /// generated by the frontend. + unsigned printBeforeFirst : 1; + /// Enable verbose mode for the tapir target LLVM passes. This is different /// from -mllvm -debug-only=. 
When the verbose /// flag is set, the passes may print some subset of the information that is @@ -219,6 +227,10 @@ class KitsuneOptions { void setStripmineLoops(bool stripmineLoops = true) { this->stripmineLoops = stripmineLoops; } + + void setPrintBeforeFirst(bool printBeforeFirst = true) { + this->printBeforeFirst = printBeforeFirst; + } /// @} /// @{ @@ -292,6 +304,8 @@ class KitsuneOptions { bool getStripmineLoops() const { return stripmineLoops; } + bool getPrintBeforeFirst() const { return printBeforeFirst; } + bool getTapirVerbose() const { return tapirVerbose; } bool getKitrtVerbose() const { return kitrtVerbose; } diff --git a/kitsune/test/driver/print-before-first.c b/kitsune/test/driver/print-before-first.c new file mode 100644 index 0000000000000..2e6fd1db37d3a --- /dev/null +++ b/kitsune/test/driver/print-before-first.c @@ -0,0 +1,40 @@ +// Check that the --print-before-first option is handled correctly. +// +// ---------------------------------------------------------------------------- +// The --print-before-first option must be used with a Kitsune frontend +// +// RUN: not %clang -O1 --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix FRONTEND +// +// FRONTEND: option '--print-before-first' must be used with a Kitsune frontend +// +// ---------------------------------------------------------------------------- +// The --print-before-first option requires the --tapir option +// +// RUN: not %kitcc -O1 --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix TT -allow-empty +// +// TT: --tapir is required with '--print-before-first' +// +// ---------------------------------------------------------------------------- +// If the --print-before-first option has been implemented correctly, mem2reg +// will not have run, so a stack slot will have been created for the function +// argument. 
+// +// RUN: %kitcc -O1 --tapir=serial --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix EARLY + +// EARLY: define {{.+}}ptr @f(ptr {{.*}}%[[P:[^)]+]]) +// EARLY: %[[SLOT:.+]] = alloca ptr +// EARLY: store ptr %[[P]], ptr %[[SLOT]] +// EARLY: %[[RV:.+]] = load ptr, ptr %[[SLOT]] +// EARLY: ret ptr %[[RV]] +// +// ---------------------------------------------------------------------------- + +void* f(void* p) { + return p; +} diff --git a/kitsune/test/driver/print-before-first.f90 b/kitsune/test/driver/print-before-first.f90 new file mode 100644 index 0000000000000..1af4ae07173c3 --- /dev/null +++ b/kitsune/test/driver/print-before-first.f90 @@ -0,0 +1,39 @@ +! REQUIRES: kitfc +! +! Check that the --print-before-first option is handled correctly. +! +! ---------------------------------------------------------------------------- +! The --print-before-first option must be used with a Kitsune frontend +! +! RUN: not %flang -O1 --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix FRONTEND +! +! FRONTEND: option '--print-before-first' must be used with a Kitsune frontend +! +! ---------------------------------------------------------------------------- +! The --print-before-first option requires the --tapir option +! +! RUN: not %kitfc -O1 --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix TT -allow-empty +! +! TT: --tapir is required with '--print-before-first' +! +! ---------------------------------------------------------------------------- +! If the --print-before-first option has been implemented correctly, @_QQmain +! will be called from @main. Since @_QQmain is empty, it will be absent after +! optimizations are run (because the empty body will have been inlined) +! +! RUN: %kitfc -O1 --tapir=serial --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +!
RUN: | FileCheck %s -check-prefix EARLY +! +! EARLY: define {{.*}}i32 @main +! EARLY: call {{.+}} @_FortranAProgramStart +! EARLY: call {{.+}} @_QQmain +! EARLY: call {{.+}} @_FortranAProgramEndStatement +! +! ---------------------------------------------------------------------------- + +end program From 8c040e32153b8e6e04656327518e950ba3a4a5ef Mon Sep 17 00:00:00 2001 From: Tarun Prabhu Date: Mon, 20 Oct 2025 14:03:53 -0600 Subject: [PATCH 02/23] [kitsune] Simplify GPU-centric tapir targets CudaABI and HipABI had custom argument orders for the outlined functions consisting of the bodies of tapir loops. This made them more complicated than necessary since nothing else in the code was particularly dependent on the order of the arguments. This has been fixed so those two tapir targets just use the implementation of setupLoopOutlineArgs from the default LoopOutlineProcessor for most of the work and add a small amount of additional code. Some code was also cleaned up to use the newer API in llvm::Function to get the nth llvm::Argument. In addition to updating the tests to reflect these changes, some were also tweaked to avoid very long lines and to make it easier to determine what exactly was being matched. 
--- kitsune/test/tapir/cuda/kernel-opt.ll | 8 +- kitsune/test/tapir/cuda/kernel-threadiv-1d.ll | 6 +- kitsune/test/tapir/cuda/launch.ll | 4 +- kitsune/test/tapir/hip/kernel-opt.ll | 8 +- .../test/tapir/hip/kernel-threadiv-1d-y.ll | 6 +- kitsune/test/tapir/hip/kernel-threadiv-1d.ll | 6 +- kitsune/test/tapir/hip/launch.ll | 4 +- llvm/include/llvm/Transforms/Tapir/CudaABI.h | 16 --- llvm/include/llvm/Transforms/Tapir/HipABI.h | 29 ----- llvm/lib/Transforms/Tapir/CudaABI.cpp | 121 +++--------------- llvm/lib/Transforms/Tapir/HipABI.cpp | 115 +++-------------- 11 files changed, 71 insertions(+), 252 deletions(-) diff --git a/kitsune/test/tapir/cuda/kernel-opt.ll b/kitsune/test/tapir/cuda/kernel-opt.ll index 682292a342bed..189a5ad7f93ef 100644 --- a/kitsune/test/tapir/cuda/kernel-opt.ll +++ b/kitsune/test/tapir/cuda/kernel-opt.ll @@ -32,7 +32,13 @@ ; RUN: | FileCheck %s --check-prefix=O2 ; ; O2-NOT: = phi i64 -; O2: define {{.+}} @__kitcu_{{.+}}(i64 {{.*}}%[[UB:[^,]+]], i64 {{[^,]+}}, i64 {{[^,]+}}, ptr {{.*}}%[[BUF:[^,]+]], i64 {{.*}}%[[N:[^)]+]]) {{.*}}#[[ATTRS:[0-9]+]] +; O2: define {{.+}} @__kitcu_{{[^(]+}}( +; O2-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[GRAINSIZE:[^,]+]], +; O2-SAME: ptr {{[^%]*}}%[[BUF:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[N:[^)]+]]) +; O2-SAME: {{.*}}#[[ATTRS:[0-9]+]] ; O2-NEXT: [[BBENTRY:.+]]: ; O2-NEXT: %[[TID:.+]] = tail call {{(range.+ )?}}i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; O2-NEXT: %[[BIDX:.+]] = tail call {{(range.+ )?}}i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() diff --git a/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll b/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll index 90132664556fd..ed8d90f8cb134 100644 --- a/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll +++ b/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll @@ -6,7 +6,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{[^(]+}}( +; 
CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[TID:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK: %[[BIDX:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; CHECK: %[[BDIM:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() diff --git a/kitsune/test/tapir/cuda/launch.ll b/kitsune/test/tapir/cuda/launch.ll index 4c81e9c68f494..f3a2a8f608c82 100644 --- a/kitsune/test/tapir/cuda/launch.ll +++ b/kitsune/test/tapir/cuda/launch.ll @@ -28,8 +28,8 @@ ; These are followed by a variable number of arguments that are to be passed to ; the kernel being launched. These are typically in the order ; -; - trip count ; - start index +; - trip count ; - grain size ; - ... ; @@ -44,8 +44,8 @@ ; CHECK-SAME: i32 0, ; CHECK-SAME: ptr {{.*}}@[[G_KERNEL_PROPS]], ; CHECK-SAME: ptr %[[STREAM]], -; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 0, +; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 1, ; CHECK-SAME: ptr %c, ; CHECK-SAME: i64 %n diff --git a/kitsune/test/tapir/hip/kernel-opt.ll b/kitsune/test/tapir/hip/kernel-opt.ll index 4f58eec184fd2..a16de3d57a85d 100644 --- a/kitsune/test/tapir/hip/kernel-opt.ll +++ b/kitsune/test/tapir/hip/kernel-opt.ll @@ -32,7 +32,13 @@ ; RUN: | FileCheck %s --check-prefix=O2 ; ; O2-NOT: = phi i64 -; O2: define {{.+}} @__kithip_{{.+}}(i64 {{.*}}%[[UB:[^,]+]], i64 {{[^,]+}}, i64 {{[^,]+}}, ptr {{.*}}%[[BUF:[^,]+]], i64 {{.*}}%[[N:[^)]+]]) {{.*}}#[[ATTRS:[0-9]+]] +; O2: define {{.+}} @__kithip_{{[^(]+}}( +; O2-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[GRAINSIZE:[^,]+]], +; O2-SAME: ptr {{[^%]*}}%[[BUF:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[N:[^)]+]]) +; O2-SAME: {{.*}}#[[ATTRS:[0-9]+]] ; O2-NEXT: [[BBENTRY:.+]]: ; O2-NEXT: %[[BUFCST:.+]] = addrspacecast ptr %[[BUF]] to ptr addrspace(1) ; O2-NEXT: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.x() diff --git 
a/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll b/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll index 8e93d1d33071b..008d681218a27 100644 --- a/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll +++ b/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll @@ -7,7 +7,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{.+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.y() ; CHECK: %[[TID:.+]] = zext i32 %[[WITEM]] to i64 ; CHECK: %[[BDIM:.+]] = {{.*}}call i64 @__ockl_get_local_size(i32 1) diff --git a/kitsune/test/tapir/hip/kernel-threadiv-1d.ll b/kitsune/test/tapir/hip/kernel-threadiv-1d.ll index b2a91a1d97fc0..7247002ef907b 100644 --- a/kitsune/test/tapir/hip/kernel-threadiv-1d.ll +++ b/kitsune/test/tapir/hip/kernel-threadiv-1d.ll @@ -6,7 +6,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{.+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.x() ; CHECK: %[[TID:.+]] = zext i32 %[[WITEM]] to i64 ; CHECK: %[[BDIM:.+]] = {{.*}}call i64 @__ockl_get_local_size(i32 0) diff --git a/kitsune/test/tapir/hip/launch.ll b/kitsune/test/tapir/hip/launch.ll index 72b8dec1de102..183806f2d18d1 100644 --- a/kitsune/test/tapir/hip/launch.ll +++ b/kitsune/test/tapir/hip/launch.ll @@ -29,8 +29,8 @@ ; These are followed by a variable number of arguments that are to be passed to ; the kernel being launched. These are typically in the order ; -; - trip count ; - start index +; - trip count ; - grain size ; - ... 
; @@ -45,8 +45,8 @@ ; CHECK-SAME: i32 0, ; CHECK-SAME: ptr {{.*}}@[[G_KERNEL_PROPS]], ; CHECK-SAME: ptr %[[STREAM]], -; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 0, +; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 1, ; CHECK-SAME: ptr %c, ; CHECK-SAME: i64 %n diff --git a/llvm/include/llvm/Transforms/Tapir/CudaABI.h b/llvm/include/llvm/Transforms/Tapir/CudaABI.h index 15f9efd74c287..5d9edd5dd03e6 100644 --- a/llvm/include/llvm/Transforms/Tapir/CudaABI.h +++ b/llvm/include/llvm/Transforms/Tapir/CudaABI.h @@ -144,8 +144,6 @@ class CudaLoop : public LoopOutlineProcessor { // Cuda/PTX grid dimensions access. Function *CUGridDimX = nullptr, *CUGridDimY = nullptr, *CUGridDimZ = nullptr; - SmallVector OrderedInputs; - /// The GlobalValue's used in the loop that is being outlined. This includes /// functions, global variables, aliases and ifunc's. std::set UsedGlobalValues; @@ -161,25 +159,11 @@ class CudaLoop : public LoopOutlineProcessor { const TapirTargetOptions &TTOpts); ~CudaLoop(); - void setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) override final; - - unsigned getIVArgIndex(const Function &F, - const ValueSet &Args) const override final; - - unsigned getLimitArgIndex(const Function &F, - const ValueSet &Args) const override final; - void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) override; void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; - void remapData(ValueToValueMapTy &VMap) override final; }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir/HipABI.h b/llvm/include/llvm/Transforms/Tapir/HipABI.h index 77fbc393dab01..dc818073cf4b4 100644 --- a/llvm/include/llvm/Transforms/Tapir/HipABI.h +++ 
b/llvm/include/llvm/Transforms/Tapir/HipABI.h @@ -159,31 +159,6 @@ class HipLoop : public LoopOutlineProcessor { const TapirTargetOptions &TTO); ~HipLoop(); - /// Prepares the set HelperArgs of function arguments for the outlined helper - /// function Helper for a Tapir loop. Also prepares the list HelperInputs of - /// input values passed to a call to Helper. HelperArgs and HelperInputs are - /// derived from the loop-control arguments LCArgs and loop-control inputs - /// LCInputs for the Tapir loop, as well the set TLInputsFixed of arguments to - /// the task underlying the Tapir loop. - void setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) override; - - /// Returns an integer identifying the index of the helper-function argument - /// in Args that specifies the starting iteration number. This return value - /// must complement the behavior of setupLoopOutlineArgs(). - unsigned getIVArgIndex(const Function &F, - const ValueSet &Args) const override; - - /// Returns an integer identifying the index of the helper-function argument - /// in Args that specifies the ending iteration number. This return value - /// must complement the behavior of setupLoopOutlineArgs(). - unsigned getLimitArgIndex(const Function &F, - const ValueSet &Args) const override; - /// Process the TapirLoop before it is outlined -- just prior to the /// outlining occurs. This allows the VMap and related details to be /// customized prior to outlining related operations (e.g. 
cloning of @@ -199,8 +174,6 @@ class HipLoop : public LoopOutlineProcessor { void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override; - void remapData(ValueToValueMapTy &VMap) override final; - private: Value *emitWorkItemId(IRBuilder<> &Builder, int ItemIndex); Value *emitWorkGroupId(IRBuilder<> &Builder, int ItemIndex); @@ -220,8 +193,6 @@ class HipLoop : public LoopOutlineProcessor { FunctionCallee HipWorkGroupIdXFn, HipWorkGroupIdYFn, HipWorkGroupIdZFn; FunctionCallee HipBlockDimFn; - SmallVector OrderedInputs; - /// The GlobalValue's used in the loop that is being outlined. This includes /// functions, global variables, aliases and ifunc's. std::set UsedGlobalValues; diff --git a/llvm/lib/Transforms/Tapir/CudaABI.cpp b/llvm/lib/Transforms/Tapir/CudaABI.cpp index 6d72fa29a937f..ec731bcc7a9d3 100644 --- a/llvm/lib/Transforms/Tapir/CudaABI.cpp +++ b/llvm/lib/Transforms/Tapir/CudaABI.cpp @@ -198,78 +198,6 @@ CudaLoop::~CudaLoop() { << KernelName << "'.\n"); } -void CudaLoop::setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) { - LLVM_DEBUG(dbgs() << "debug[cuabi]: setting up loop outline arguments...\n"); - - // Add the loop control inputs -- the first parameter defines the extent of - // the index space (the number of threads to launch). - { - Argument *EndArg = cast(LCArgs[1]); - EndArg->setName("runSize"); - HelperArgs.insert(EndArg); - - Value *InputVal = LCInputs[1]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The second parameter defines the start of the index space. 
- { - Argument *StartArg = cast(LCArgs[0]); - StartArg->setName("runStart"); - HelperArgs.insert(StartArg); - - Value *InputVal = LCInputs[0]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The third parameter defines the grain size, if it is not constant. - if (!isa(LCInputs[2])) { - Argument *GrainsizeArg = cast(LCArgs[2]); - GrainsizeArg->setName("grainSize"); - HelperArgs.insert(GrainsizeArg); - - Value *InputVal = LCInputs[2]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // Add the loop-centric kernel parameters (i.e., variables/arrays - // used in the loop body). - LLVM_DEBUG(dbgs() << " - adding loop-centric kernel arguments...\n"); - for (Value *V : TLInputsFixed) { - HelperArgs.insert(V); - HelperInputs.push_back(V); - LLVM_DEBUG(dbgs() << " - arg: " << V->getName() << "\n"); - } - - LLVM_DEBUG(dbgs() << " - adding helper kernel arguments...\n"); - for (Value *V : HelperInputs) { - OrderedInputs.push_back(V); - LLVM_DEBUG(dbgs() << " - helper arg: " << V->getName() << "\n"); - } - - LLVM_DEBUG(dbgs() << " - done.\n"); -} - -unsigned CudaLoop::getIVArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the primary induction variable is the second input. - return 1; -} - -unsigned CudaLoop::getLimitArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the loop limit is the first input. - return 0; -} - void CudaLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { LLVM_DEBUG(dbgs() << "debug[cuabi]: -preprocessing loop for kernel '" << KernelName << "'.\n"); @@ -378,23 +306,19 @@ void CudaLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the loop limit. 
Get // end and grainsize arguments - Argument *End; - Value *Grainsize; - { - // TODO: We only support a grain size of 1 right now. Not clear if this - // could be a future optimization but strip mining on our current tests only - // results in degraded performance... - auto OutlineArgsIter = KernelF->arg_begin(); - // End argument is the first LC arg. - End = &*OutlineArgsIter++; - - // Get the grainsize value, which is either constant or the third LC arg. - // if (unsigned ConstGrainsize = TLI.getGrainsize()) - // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - // else - Grainsize = - ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); - } + + // End argument is always the second argument in the kernel function. + Argument *End = KernelF->getArg(1); + + // Get the grainsize value, which is either constant or the third LC arg. + // TODO: We only support a grain size of 1 right now. Not clear if this + // could be a future optimization but strip mining on our current tests only + // results in degraded performance. 
+ // if (unsigned ConstGrainsize = TLI.getGrainsize()) + // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); + // else + Value *Grainsize = + ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); IRBuilder<> B(Entry->getTerminator()); @@ -433,14 +357,6 @@ void CudaLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, ClonedCond->setOperand(TripCountIdx, ThreadEnd); } -void CudaLoop::remapData(ValueToValueMapTy &VMap) { - for (auto &V : OrderedInputs) { - if (auto MappedV = VMap[V]) { - V = MappedV; - } - } -} - void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "cudaloop: processing outlined loop call...\n" @@ -468,12 +384,13 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, TapirLoopHints Hints(TL.getLoop()); Value *TPB = ConstantInt::get(Int32Ty, Hints.getThreadsPerBlock()); - BasicBlock *RCBB = TOI.ReplCall->getParent(); - BasicBlock *NewBB = RCBB->splitBasicBlock(TOI.ReplCall); + CallBase* CallOutlined = cast(TOI.ReplCall); + BasicBlock *RCBB = CallOutlined->getParent(); + BasicBlock *NewBB = RCBB->splitBasicBlock(CallOutlined); IRBuilder<> Builder(&NewBB->front()); // Deal with type mismatches for the trip count. 
- Value *TripCount = OrderedInputs[0]; + Value *TripCount = CallOutlined->getArgOperand(1); if (TripCount->getType() != Int64Ty) TripCount = Builder.CreateSExtOrBitCast(TripCount, Int64Ty, "cast.tc"); @@ -485,7 +402,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Builder.CreateIntrinsic(PtrTy, Intrinsic::kit_thread_stream, {CTT}); std::vector Args = {CTT, EmbFB, KName, TripCount, TPB, KProps, CudaStream}; - for (Value *Inp : OrderedInputs) + for (Value *Inp : CallOutlined->args()) Args.push_back(Inp); // TODO: We should probably have the launch and sync kitsune intrinsics take @@ -506,7 +423,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // the global again). copyNonConstGlobalsDToH(UsedGlobalValues, TTID::Cuda, M, Builder); - TOI.ReplCall->eraseFromParent(); + CallOutlined->eraseFromParent(); LLVM_DEBUG(dbgs() << "*** finished processing outlined call.\n"); } diff --git a/llvm/lib/Transforms/Tapir/HipABI.cpp b/llvm/lib/Transforms/Tapir/HipABI.cpp index d2b27ce7dcce0..38a9cfdec855d 100644 --- a/llvm/lib/Transforms/Tapir/HipABI.cpp +++ b/llvm/lib/Transforms/Tapir/HipABI.cpp @@ -221,75 +221,6 @@ HipLoop::HipLoop(Module &M, Module &KM, StringRef Name, HipLoop::~HipLoop() { /* no-op */ } -// TODO: Can we also transform the arguments into a different address space here -// and avoid our use of 'mutate' elsewhere in the code? -void HipLoop::setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) { - LLVM_DEBUG(dbgs() << "\n\n" - << "hipabi: SETTING UP LOOP OUTLINE ARGUMENTS FOR '" - << F.getName() << "()'.\n"); - - // Add the loop control inputs -- the first parameter defines the extent of - // the index space. - { - Argument *EndArg = cast(LCArgs[1]); - EndArg->setName(".kern.input_size"); // nice for debugging... 
- HelperArgs.insert(EndArg); - - Value *InputVal = LCInputs[1]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The second parameter defines the start of the index space. - { - Argument *StartArg = cast(LCArgs[0]); - StartArg->setName(".kern.start_idx"); - HelperArgs.insert(StartArg); - - Value *InputVal = LCInputs[0]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The third parameter defines the grain size, if it is not constant. - if (!isa(LCInputs[2])) { - Argument *GrainsizeArg = cast(LCArgs[2]); - GrainsizeArg->setName(".kern.grain_size"); - HelperArgs.insert(GrainsizeArg); - - Value *InputVal = LCInputs[2]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // Add the loop-centric kernel parameters (i.e., variables/arrays - // used in the loop body). - for (Value *V : TLInputsFixed) { - HelperArgs.insert(V); - HelperInputs.push_back(V); - } - - for (Value *V : HelperInputs) { - OrderedInputs.push_back(V); - } -} - -unsigned HipLoop::getIVArgIndex(const Function &F, const ValueSet &Args) const { - // The argument for the primary induction variable is the second input. - return 1; -} - -unsigned HipLoop::getLimitArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the loop limit is the first input. - return 0; -} - void HipLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { bool VerboseMode = getOptions().getTapirVerbose(); if (VerboseMode) { @@ -429,22 +360,19 @@ void HipLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the limit. // Get end and grain size arguments - Argument *End; - Value *Grainsize; - { - // TODO: We really only want a grain size of 1 for now... - auto OutlineArgsIter = KernelF->arg_begin(); - // End argument is the first LC arg. 
- End = &*OutlineArgsIter++; - - // Get the grain size value, which is either constant or the third LC - // arg. - // if (unsigned ConstGrainsize = TLI.getGrainsize()) - // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - // else - Grainsize = - ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); - } + + // End argument is always the second argument in the kernel function. + Argument *End = KernelF->getArg(1); + + // Get the grainsize value, which is either constant or the third LC arg. + // TODO: We only support a grain size of 1 right now. Not clear if this + // could be a future optimization but strip mining on our current tests only + // results in degraded performance. + // if (unsigned ConstGrainsize = TLI.getGrainsize()) + // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); + // else + Value *Grainsize = + ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); IRBuilder<> Builder(Entry->getTerminator()); @@ -500,12 +428,6 @@ void HipLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, ClonedCond->setOperand(TripCountIdx, ThreadEnd); } -void HipLoop::remapData(ValueToValueMapTy &VMap) { - for (auto &V : OrderedInputs) - if (Value *MappedV = VMap[V]) - V = MappedV; -} - void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "hiploop: processing outlined loop call...\n" @@ -541,12 +463,13 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, else TPB = ConstantInt::get(Int32Ty, 0); - BasicBlock *RCBB = TOI.ReplCall->getParent(); - BasicBlock *NewBB = RCBB->splitBasicBlock(TOI.ReplCall); + CallBase *CallOutlined = cast(TOI.ReplCall); + BasicBlock *RCBB = CallOutlined->getParent(); + BasicBlock *NewBB = RCBB->splitBasicBlock(CallOutlined); IRBuilder<> Builder(&NewBB->front()); // Deal with type mismatches for the trip count. 
- Value *TripCount = OrderedInputs[0]; + Value *TripCount = CallOutlined->getArgOperand(1); if (TripCount->getType() != Int64Ty) TripCount = Builder.CreateSExtOrBitCast(TripCount, Int64Ty, "cast.tc"); @@ -558,7 +481,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Builder.CreateIntrinsic(PtrTy, Intrinsic::kit_thread_stream, {CTT}); std::vector Args = {CTT, EmbFB, KName, TripCount, TPB, KProps, HipStream}; - for (Value *Inp : OrderedInputs) + for (Value *Inp : CallOutlined->args()) Args.push_back(Inp); // TODO: We should probably have the launch and sync kitsune intrinsics take @@ -579,7 +502,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // the global again). copyNonConstGlobalsDToH(UsedGlobalValues, TTID::Hip, M, Builder); - TOI.ReplCall->eraseFromParent(); + CallOutlined->eraseFromParent(); LLVM_DEBUG(dbgs() << "*** finished processing outlined call.\n"); } From a8ee49d8f25627d62838ab5a830d6350af9b2601 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Fri, 10 Dec 2021 09:37:44 -0700 Subject: [PATCH 03/23] Basic loop stripmine-based implicit parallel reduction (sum only) codegen --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 112 ++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index ca651c64c7ece..a17f3e81e80c4 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1572,5 +1572,117 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. 
TI->recalculate(*F, *DT); + // iterate through the stores that should be treated as reductions + const std::vector& blocks = L->getBlocks(); + std::set reductions; + for (BasicBlock *BB : blocks){ + for (Instruction &I : *BB) { + if(auto si = dyn_cast(&I)){ + // TODO: better check if the store should be treated as a + // reduction. What we're doing is just checking if it's + // storing to a loop invariant pointer + Value* ptr = si->getPointerOperand(); + if(L->isLoopInvariant(ptr)) + reductions.insert(ptr); + } + } + } + + ValueToValueMap redMap; + // TODO: Modify the strip mining outer loop to be smaller: currently we are + // stack allocating n/2048 reduction values. + // TODO: Initialize local reductions with unit values + for(Value* ptr : reductions){ + IRBuilder<> B(F->getEntryBlock().getTerminator()); + auto ty = dyn_cast(ptr->getType())->getElementType(); + auto al = B.CreateAlloca(ty, TripCount, ptr->getName() + "_reduction"); + IRBuilder<> BH(L->getHeader()->getTerminator()); + auto lptr = BH.CreateBitCast( + BH.CreateGEP(al, NewIdx), + ptr->getType()); + redMap[ptr] = al; + // TODO: for now, just initializing with the initial sequential + // reduction value, which is often unit, but if it isn't this is + // wrong. + ptr->replaceUsesWithIf(lptr, [L](Use &u){ + if(auto I = dyn_cast(u.getUser())){ + I->dump(); + return L->contains(I->getParent()); + } else { + return false; + }; + }); + BH.CreateStore(BH.CreateLoad(ptr), lptr); + } + + // Epilog "join" of reduction values stored in local reduction value arrays. + // Should be able to use redMap to map original pointer (which is still used + // to reduce the remainder of the strimined loop, so you probably want to + // start the reduction with that value). 
+ LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F); + + // We insert the reduction code at every sync corresponding to the strimined + // loop + // + // Sync + // RedEpiHeader + // RedEpiBody + // RedEpiExit + + if(!reductions.empty()){ + SmallVector syncs; + for(auto &bb : *F){ + if(auto *sync = dyn_cast(bb.getTerminator())){ + syncs.push_back(sync); + } + } + for(auto *sync : syncs){ + if(sync->getSyncRegion() == SyncReg){ + BasicBlock *PostSync = sync->getSuccessor(0); + BasicBlock* RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT); + PHINode *Idx = PHINode::Create(TripCount->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(TripCount, PostSync); + Instruction *bodyTerm, *exitTerm; + Value *cmp = BH.CreateIsNotNull(Idx); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. For now defaults to sums on primitive types. + // TODO: Add custom unital magmas and/or infer unital magma + for(auto& kv : redMap){ + auto al = kv.second; + Value* ptr = const_cast(kv.first); + auto lptr = BB.CreateBitCast( + BB.CreateGEP(al, Idx), + ptr->getType()); + auto acc = BB.CreateLoad(ptr); + auto x = BB.CreateLoad(lptr); + auto newacc = acc->getType()->isFloatingPointTy() ? 
BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x); + BB.CreateStore(newacc, ptr); + } + Value *IdxSub = + BB.CreateSub(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".sub"); + Idx->addIncoming(IdxSub, bodyTerm->getParent()); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + } + } + } + + LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); + + // TODO: fix DT updates + DT->recalculate(*F); + /* +#ifndef NDEBUG + DT->verify(); + LI->verify(*DT); +#endif + */ + return NewLoop; } From bacfd201654a0c0228754d34b2beeb45096556cc Mon Sep 17 00:00:00 2001 From: George Stelle Date: Fri, 10 Dec 2021 09:39:32 -0700 Subject: [PATCH 04/23] Explicit parallel reductions via unital magmas added --- clang/lib/Headers/magma.h | 43 ++++++++++++++++++++++++++ clang/lib/Headers/reductions.h | 56 ++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 clang/lib/Headers/magma.h create mode 100644 clang/lib/Headers/reductions.h diff --git a/clang/lib/Headers/magma.h b/clang/lib/Headers/magma.h new file mode 100644 index 0000000000000..43c4fb8f1075f --- /dev/null +++ b/clang/lib/Headers/magma.h @@ -0,0 +1,43 @@ +#include +#include + +template +struct Magma { + virtual a op(a x, a y) = 0; +}; + +template +struct UnitalMagma : public Magma { + virtual a id() = 0; +}; + +// Example unital magmas +template +struct Sum : UnitalMagma{ + a op(a x, a y){ return x + y; } + a id(){ return 0; } // look into this more +}; + +template +struct Product : UnitalMagma { + a op(a x, a y){ return x * y; } + a id(){ return 1; } +}; + +struct StringApp : UnitalMagma { + std::string op(std::string x, std::string y){ return x.append(y); } + std::string id() { return ""; } +}; + +template +struct Max : UnitalMagma { + a op(a x, a y){ return x > y ? x : y; } + a id() { return std::numeric_limits::min(); } +}; + +template +struct Min : UnitalMagma { + a op(a x, a y){ return x < y ? 
x : y; } + a id() { return std::numeric_limits::max(); } +}; + diff --git a/clang/lib/Headers/reductions.h b/clang/lib/Headers/reductions.h new file mode 100644 index 0000000000000..d9abf9cf276b5 --- /dev/null +++ b/clang/lib/Headers/reductions.h @@ -0,0 +1,56 @@ +#include"magma.h" +#include +#include +#include +//#include + +template +a reduce(um m, v& xs){ + auto acc = m.id(); + for(auto x : xs){ + acc = m.op(acc, x); + } + return acc; +} + +template +a parReduce(um m, v& xs, uint64_t nthreads){ + uint64_t linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + assert(linesize % sizeof(a) == 0); + uint64_t linenum = linesize / sizeof(a); + a* accs = new a[nthreads * linenum]; + uint64_t size = xs.end() - xs.begin(); + assert(size % nthreads == 0); + uint64_t grainsize = size / nthreads; + for(uint64_t i=0; i +a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs){ + if(end-start < gs){ + a acc = m.id(); + for(uint64_t i=start; i Date: Fri, 10 Dec 2021 09:45:22 -0700 Subject: [PATCH 05/23] Added reductions documentation --- kitsune/docs/using.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/kitsune/docs/using.md b/kitsune/docs/using.md index b07a2fa4ea435..e843f6f2f1e76 100644 --- a/kitsune/docs/using.md +++ b/kitsune/docs/using.md @@ -99,3 +99,32 @@ Each special mode and runtime transformation/ABI target has its own named config * ``opencl.cfg``: OpenCL runtime ABI target specific flags. These files can reduce complexity for end users by providing configuration- and build-specific flags. This can be important when version-specific bitcode files and other details are used. In addition, these files can provide developers additional flexibility for debugging, testing, and experimenting. Obviously, all these features can also be hardcoded onto the command line for a more traditional use case. 
In addition, to override any of the Kitsune or system configuration files you can place an empty config file within the user directory (no kitsune or system configuration files will be read in this case). + +## Reductions +We provide two approaches to reductions. The first (still very much a work in +progress and likely to break) is implicit reductions. This allows you to write +basic reductions in the way you would for sequential code, and have them be +optimized for parallelism, e.g. + +``` +forall(auto x : xs) { + acc += x; +} +``` + +should generate efficient parallel reduction code. + +Second, we provide a c++ interface for parallel reduction via user-defined +reduction operators. Formally, we require a unital magma, which is just a +reduction operator and a unit value, e.g. 0 for sums and 1 for products. + +This allows for the following style of reductions: + +``` +#include +... + double sum = reduce(Sum(), big); +``` + + + From d51fb68a1b911d2931d2663ff0e31e3bbe9dd843 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 28 Jun 2022 10:34:21 -0600 Subject: [PATCH 06/23] Implicit sum reductions functional --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 28 ++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index a17f3e81e80c4..89d71546ec299 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1582,20 +1582,32 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // reduction. 
What we're doing is just checking if it's // storing to a loop invariant pointer Value* ptr = si->getPointerOperand(); - if(L->isLoopInvariant(ptr)) + if(L->isLoopInvariant(ptr)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << ptr->getName() << "\n"); reductions.insert(ptr); + } } } } + LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); ValueToValueMap redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. // TODO: Initialize local reductions with unit values + Instruction *bloc = nullptr; + if(Instruction* I = dyn_cast(TripCount)){ + bloc = I->getNextNode(); + } else { + bloc = F->getEntryBlock().getTerminator(); + } + IRBuilder<> RB(bloc); + Value *outerIters = RB.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); for(Value* ptr : reductions){ - IRBuilder<> B(F->getEntryBlock().getTerminator()); auto ty = dyn_cast(ptr->getType())->getElementType(); - auto al = B.CreateAlloca(ty, TripCount, ptr->getName() + "_reduction"); + auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); IRBuilder<> BH(L->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( BH.CreateGEP(al, NewIdx), @@ -1638,13 +1650,17 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, } for(auto *sync : syncs){ if(sync->getSyncRegion() == SyncReg){ + BasicBlock *Sync = sync->getParent(); BasicBlock *PostSync = sync->getSuccessor(0); - BasicBlock* RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT); - PHINode *Idx = PHINode::Create(TripCount->getType(), 2, + BasicBlock* RedEpiHeader = BasicBlock::Create(Sync->getContext(), "reductionEpilogue", Sync->getParent(), Sync); + RedEpiHeader->moveAfter(Sync); + sync->setSuccessor(0, RedEpiHeader); + BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, "reductionepilogueidx", 
RedEpiHeader->getFirstNonPHI()); IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); - Idx->addIncoming(TripCount, PostSync); + Idx->addIncoming(outerIters, sync->getParent()); Instruction *bodyTerm, *exitTerm; Value *cmp = BH.CreateIsNotNull(Idx); SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); From bf2a4699473547e95dd35766717c5d6ab57ac4ec Mon Sep 17 00:00:00 2001 From: George Stelle Date: Wed, 1 Feb 2023 08:48:02 -0700 Subject: [PATCH 07/23] Outlined implementation approach for updated reduction implementation --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 31 ++++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 89d71546ec299..cb9b1c892f3b2 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1610,12 +1610,28 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); IRBuilder<> BH(L->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( - BH.CreateGEP(al, NewIdx), + BH.CreateGEP(ty, al, NewIdx), ptr->getType()); redMap[ptr] = al; // TODO: for now, just initializing with the initial sequential // reduction value, which is often unit, but if it isn't this is - // wrong. + // wrong. What we need to do is assume there is more than one element, and + // use the first element for the first iteration of the loop. + // roughly: + // red = init; + // forall(i = ...){ + // red = reduce(red, body(i)); + // } + // red = init; + // localred[m]; + // forall(k = ...){ + // localred[i] = body(j_0); + // for(j = j_1 ...) + // localred = reduce(localred, body(j)); + // } + // for(k = ...) 
+ // red = reduce(red, localred[k]); + // ptr->replaceUsesWithIf(lptr, [L](Use &u){ if(auto I = dyn_cast(u.getUser())){ I->dump(); @@ -1624,7 +1640,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, return false; }; }); - BH.CreateStore(BH.CreateLoad(ptr), lptr); + BH.CreateStore(BH.CreateLoad(ty, ptr), lptr); } // Epilog "join" of reduction values stored in local reduction value arrays. @@ -1672,11 +1688,12 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, for(auto& kv : redMap){ auto al = kv.second; Value* ptr = const_cast(kv.first); + auto ty = ptr->getType(); auto lptr = BB.CreateBitCast( - BB.CreateGEP(al, Idx), - ptr->getType()); - auto acc = BB.CreateLoad(ptr); - auto x = BB.CreateLoad(lptr); + BB.CreateGEP(ty, al, Idx), + ty); + auto acc = BB.CreateLoad(ty, ptr); + auto x = BB.CreateLoad(ty, lptr); auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x); BB.CreateStore(newacc, ptr); } From c8bdc81dd6483cc0ab01d9390239d213ba6981e8 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 21 Feb 2023 16:17:19 -0700 Subject: [PATCH 08/23] Working reductions, sequential semantics preserved for commutative associative magmas --- clang/include/clang/Basic/Attr.td | 7 + clang/lib/CodeGen/CGCall.cpp | 3 + clang/lib/Headers/kitsune.h.cmake | 145 ++++++++++++++++++++ clang/lib/Sema/SemaDeclAttr.cpp | 4 +- kitsune-tests/reductions/l2.c | 36 +++++ llvm/include/llvm/IR/Attributes.td | 10 ++ llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 140 ++++++++++--------- 7 files changed, 280 insertions(+), 65 deletions(-) create mode 100644 clang/lib/Headers/kitsune.h.cmake create mode 100644 kitsune-tests/reductions/l2.c diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 959367a73add2..bda7c1e4d6c21 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -5227,6 +5227,13 @@ def NonString : InheritableAttr 
{ let Spellings = [GCC<"nonstring">]; let Subjects = SubjectList<[Var, Field]>; let Documentation = [NonStringDocs]; + +// +===== kitsune-/tapir-centric attributes + +def KitsuneReduction : InheritableAttr { + let Spellings = [Clang<"kitsune_reduction">]; + let Subjects = SubjectList<[FunctionLike]>; + let Documentation = [StrandMallocDocs]; } def TapirTarget : StmtAttr { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 2336c3da3b38c..9fad6a8bd4f39 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2512,6 +2512,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly()); FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); } + if (TargetDecl->hasAttr()) { + FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction); + } if (const auto *RA = TargetDecl->getAttr(); RA && RA->getDeallocator() == nullptr) RetAttrs.addAttribute(llvm::Attribute::NoAlias); diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake new file mode 100644 index 0000000000000..d8e6cd4bf86ee --- /dev/null +++ b/clang/lib/Headers/kitsune.h.cmake @@ -0,0 +1,145 @@ + +/* + * Copyright (c) 2020 Triad National Security, LLC + * All rights reserved. + * + * This file is part of the kitsune/llvm project. It is released under + * the LLVM license. 
+ */ +#ifndef __CLANG_KITSUNE_H__ +#define __CLANG_KITSUNE_H__ + +#include +#include + +#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET + +#include "kitsune_rt.h" + +#if defined(reduction) +#warning found reduction definition: try puttin kitsune lower in include order +#else +#define reduction __attribute__((noinline, kitsune_reduction)) +#endif + + +#if defined(KITSUNE_ENABLE_OPENCL_ABI_TARGET) +#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)a, n) +#ifdef __cplusplus +extern "C" { +#endif + void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n); +#ifdef __cplusplus +} +#endif +#endif + +#if defined(spawn) +#warning encountered multiple definitions of spawn! +#else +#define spawn _kitsune_spawn +#endif + +#if defined(sync) +#warning encountered multiple definitions of sync! +#else +#define sync _kitsune_sync +#endif + +#if defined(forall) +#warning encountered multiple definitions of forall! 
+#else +#define forall _kitsune_forall +#endif + + +#if defined(_tapir_cuda_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_cuMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_cuMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_cuMemFree(void*); + template + void dealloc(T* array) { + __kitrt_cuMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_cuMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_cuMemAllocManaged(total_bytes); + } + + void __kitrt_cuMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_cuMemFree(array); + } + #endif +#elif defined(_tapir_hip_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_hipMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_hipMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_hipMemFree(void*); + template + void dealloc(T* array) { + __kitrt_hipMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_hipMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_hipMemAllocManaged(total_bytes); + } + + void __kitrt_hipMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_hipMemFree(array); + } + #endif +#else + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_defaultMemAlloc(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_defaultMemAlloc(sizeof(T) * N); + } + + extern "C" void __kitrt_defaultMemFree(void*); + template + void dealloc(T* array) { + __kitrt_defaultMemFree(array); + } + #else + void* __attribute__((malloc)) __kitrt_defaultMemAlloc(size_t); + inline 
__attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_defaultMemAlloc(total_bytes); + } + + void __kitrt_defaultMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void* array) { + __kitrt_defaultMemFree(array); + } + #endif // __cplusplus +#endif // cpu targets + +#endif diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 2ab0b218cf8c6..19f7b537003e6 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7902,10 +7902,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_TypeNullable: handleNullableTypeAttr(S, D, AL); break; - case ParsedAttr::AT_VTablePointerAuthentication: handleVTablePointerAuthentication(S, D, AL); break; + case ParsedAttr::AT_KitsuneReduction: + handleSimpleAttribute(S, D, AL); + break; } } diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c new file mode 100644 index 0000000000000..fc81e0057e8a5 --- /dev/null +++ b/kitsune-tests/reductions/l2.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +reduction +void sum(float *a, float b){ + *a += b; +} + +float l2(int n, float* a){ + float red = 3.14159; + forall(int i=0; i 1 ? atoi(argv[1]) : 4096 ; + float* arr = (float*)malloc(sizeof(float) * n); + for(int i=0 ; i; /// Function only reads from memory. def ReadOnly : EnumAttr<"readonly", IntersectAnd, [ParamAttr]>; +<<<<<<< HEAD +======= +/// Tapir reducer-related attributes. +def HyperView : EnumAttr<"hyper_view", [FnAttr]>; +def HyperToken : EnumAttr<"hyper_token", [FnAttr]>; +def ReducerRegister : EnumAttr<"reducer_register", [FnAttr]>; +def ReducerUnregister : EnumAttr<"reducer_unregister", [FnAttr]>; +def KitsuneReduction : EnumAttr<"kitsune_reduction", [FnAttr]>; + +>>>>>>> e2078e0739d8 (Working reductions, sequential semantics preserved for commutative associative magmas) /// Return value is always equal to this argument. 
def Returned : EnumAttr<"returned", IntersectAnd, [ParamAttr]>; diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index cb9b1c892f3b2..27a56ddfeb913 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; @@ -825,6 +826,10 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + + Function *F = Header->getParent(); + LLVM_DEBUG(dbgs() << "Function before strip mining\n" << *F); + // We will use the increment of the primary induction variable to derive // wrapping flags. Instruction *PrimaryInc = @@ -1572,26 +1577,25 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. TI->recalculate(*F, *DT); - // iterate through the stores that should be treated as reductions + // accumulate reductions const std::vector& blocks = L->getBlocks(); - std::set reductions; + std::set reductions; for (BasicBlock *BB : blocks){ for (Instruction &I : *BB) { - if(auto si = dyn_cast(&I)){ - // TODO: better check if the store should be treated as a - // reduction. 
What we're doing is just checking if it's - // storing to a loop invariant pointer - Value* ptr = si->getPointerOperand(); - if(L->isLoopInvariant(ptr)){ - LLVM_DEBUG(dbgs() << "Found reduction var: " << ptr->getName() << "\n"); - reductions.insert(ptr); + if(auto ci = dyn_cast(&I)){ + auto f = ci->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << + "with reduction function: " << f->getName() << "\n"); + reductions.insert(ci); + //TODO: check the type to confirm valid reduction } } } } LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); - ValueToValueMap redMap; + std::vector> redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. // TODO: Initialize local reductions with unit values @@ -1605,17 +1609,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Value *outerIters = RB.CreateUDiv(TripCount, ConstantInt::get(TripCount->getType(), Count), "stripiter"); - for(Value* ptr : reductions){ + for(CallInst* ci : reductions){ + auto ptr = ci->getArgOperand(0); auto ty = dyn_cast(ptr->getType())->getElementType(); auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); - IRBuilder<> BH(L->getHeader()->getTerminator()); + IRBuilder<> BH(NewLoop->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( BH.CreateGEP(ty, al, NewIdx), ptr->getType()); - redMap[ptr] = al; - // TODO: for now, just initializing with the initial sequential - // reduction value, which is often unit, but if it isn't this is - // wrong. What we need to do is assume there is more than one element, and + redMap.push_back(std::make_tuple(ci, ptr, al)); + // Assume there is more than one element, and // use the first element for the first iteration of the loop. 
// roughly: // red = init; @@ -1627,20 +1630,18 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // forall(k = ...){ // localred[i] = body(j_0); // for(j = j_1 ...) - // localred = reduce(localred, body(j)); + // reduce(localred+i, body(j)); // } // for(k = ...) - // red = reduce(red, localred[k]); + // reduce(&red, localred[k]); // ptr->replaceUsesWithIf(lptr, [L](Use &u){ if(auto I = dyn_cast(u.getUser())){ - I->dump(); return L->contains(I->getParent()); } else { return false; }; }); - BH.CreateStore(BH.CreateLoad(ty, ptr), lptr); } // Epilog "join" of reduction values stored in local reduction value arrays. @@ -1657,53 +1658,64 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // RedEpiBody // RedEpiExit + // Todo: re-order epilogue and reduction epilogue to preserve associativity if(!reductions.empty()){ - SmallVector syncs; - for(auto &bb : *F){ - if(auto *sync = dyn_cast(bb.getTerminator())){ - syncs.push_back(sync); - } - } - for(auto *sync : syncs){ - if(sync->getSyncRegion() == SyncReg){ - BasicBlock *Sync = sync->getParent(); - BasicBlock *PostSync = sync->getSuccessor(0); - BasicBlock* RedEpiHeader = BasicBlock::Create(Sync->getContext(), "reductionEpilogue", Sync->getParent(), Sync); - RedEpiHeader->moveAfter(Sync); - sync->setSuccessor(0, RedEpiHeader); - BranchInst::Create(PostSync, RedEpiHeader); - PHINode *Idx = PHINode::Create(outerIters->getType(), 2, - "reductionepilogueidx", - RedEpiHeader->getFirstNonPHI()); - IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); - Idx->addIncoming(outerIters, sync->getParent()); - Instruction *bodyTerm, *exitTerm; - Value *cmp = BH.CreateIsNotNull(Idx); - SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); - - IRBuilder<> BB(bodyTerm); - // For each reduction, get the allocated thread local reduced values and - // reduce them. For now defaults to sums on primitive types. 
- // TODO: Add custom unital magmas and/or infer unital magma - for(auto& kv : redMap){ - auto al = kv.second; - Value* ptr = const_cast(kv.first); - auto ty = ptr->getType(); - auto lptr = BB.CreateBitCast( - BB.CreateGEP(ty, al, Idx), - ty); - auto acc = BB.CreateLoad(ty, ptr); - auto x = BB.CreateLoad(ty, lptr); - auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x); - BB.CreateStore(newacc, ptr); + // Peel the first iteration of the loop and replace the reduction calls in + // the peeled code with stores + peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA); + SmallVector cis; + for(auto &BB : NewLoop->blocks()){ + if(!L->contains(BB)){ // better way? + for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> pb(&I); + pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); + cis.push_back(&I); + f->removeFnAttr(Attribute::NoInline); + } + } } - Value *IdxSub = - BB.CreateSub(Idx, ConstantInt::get(Idx->getType(), 1), - Idx->getName() + ".sub"); - Idx->addIncoming(IdxSub, bodyTerm->getParent()); - ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); } } + + for(auto &I : cis){ + I->eraseFromParent(); + } + + Instruction* term = LatchExit->getTerminator(); + BasicBlock *PostSync = term->getSuccessor(0); + BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(term, SyncInst::Create(RedEpiHeader, SyncReg)); + BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit); + Instruction *bodyTerm, *exitTerm; + Value *cmp = 
BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. + for(auto& kv : redMap){ + const auto [ ci, ptr, al ] = kv; + auto ty = dyn_cast(ptr->getType())->getElementType(); + auto lptr = BB.CreateBitCast( + BB.CreateGEP(ty, al, Idx), + ptr->getType()); + auto x = BB.CreateLoad(ty, lptr); + BB.CreateCall(ci->getCalledFunction(), { ptr, x }); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + Idx->addIncoming(IdxAdd, bodyTerm->getParent()); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); } LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); From de32b106b7adcb61f62ddd78153f9dfbf96cb23c Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 13 Jun 2023 11:32:26 -0600 Subject: [PATCH 09/23] updated reduction example with timing --- kitsune-tests/reductions/l2.c | 40 +++++++++++++------ llvm/lib/Transforms/Tapir/GPUABI.cpp | 44 +++++++-------------- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 39 +++++++++++++----- 3 files changed, 73 insertions(+), 50 deletions(-) diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index fc81e0057e8a5..e39b8765da54d 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -1,36 +1,54 @@ +#include #include #include #include #include +#include reduction -void sum(float *a, float b){ +void sum(double *a, double b){ *a += b; } -float l2(int n, float* a){ - float red = 3.14159; - forall(int i=0; i 1 ? atoi(argv[1]) : 4096 ; - float* arr = (float*)malloc(sizeof(float) * n); - for(int i=0 ; i 1 ? 
atoi(argv[1]) : 2ULL<<28 ; + double* arr = (double*)gpuManagedMalloc(sizeof(double) * n); + + forall(uint64_t i=0; i + JIT("jit-callsite", cl::init(false), cl::NotHidden, + cl::desc("Wait until parallel loop is called to jit kernel. " + "(default=false)")); + Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); @@ -232,11 +239,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Grainsize argument is the third LC arg. Grainsize = &*++(++OutlineArgsIter); } - ThreadID = B.CreateMul(ThreadID, Grainsize); - Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); - Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); - Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); - Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); + Value *Cond = B.CreateICmpUGE(ThreadID, End); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); @@ -246,10 +249,10 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Update cloned loop condition to use the thread-end value. 
unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast(VMap[TL.getCondition()]); - if (ClonedCond->getOperand(0) != ThreadEnd) + if (ClonedCond->getOperand(0) != End) ++TripCountIdx; - ClonedCond->setOperand(TripCountIdx, ThreadEnd); - assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && + ClonedCond->setOperand(TripCountIdx, End); + assert(ClonedCond->getOperand(TripCountIdx) == End && "End argument not used in condition"); } @@ -262,8 +265,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Type *Int64Ty = Type::getInt64Ty(Ctx); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); - //Task *T = TL.getTask(); - //Instruction *ReplCall = cast(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; @@ -389,12 +390,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - //Value* TripCount = isSRetInput(TOI.InputSet[0]) ? TOI.InputSet[1] : TOI.InputSet[0]; - //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, - // TOI.InputSet)); - //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, - // TOI.InputSet)); - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); @@ -418,20 +413,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, B.CreateStore(VoidVPtr, argPtr); } - Value *Grainsize = TL.getGrainsize() ? 
- ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : - OrderedInputs[2]; - - //Type *Int64Ty = Type::getInt64Ty(LLVMM.getContext()); - Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); - Value *RunRem = B.CreateURem(TripCount, Grainsize); - Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); - Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); - Value *RunSize = B.CreateZExt(B.CreateAdd(RunSizeQ, IsRemAdd), Int64Ty); - - Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); - Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); - Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, RunSize }); + Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); + Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); + Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 27a56ddfeb913..2017ea93056de 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1577,7 +1577,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. 
TI->recalculate(*F, *DT); - // accumulate reductions + // accumulate reductions in main loop const std::vector& blocks = L->getBlocks(); std::set reductions; for (BasicBlock *BB : blocks){ @@ -1593,6 +1593,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, } } } + + // accumulate reductions in epilog loop LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); std::vector> redMap; @@ -1609,10 +1611,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Value *outerIters = RB.CreateUDiv(TripCount, ConstantInt::get(TripCount->getType(), Count), "stripiter"); + auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); for(CallInst* ci : reductions){ + // TODO: generic allocation/free calls auto ptr = ci->getArgOperand(0); auto ty = dyn_cast(ptr->getType())->getElementType(); - auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); + auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); + auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); + auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); + //auto al = RB.CreateBitCast(rm, ty); + //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction"); IRBuilder<> BH(NewLoop->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( BH.CreateGEP(ty, al, NewIdx), @@ -1626,13 +1634,17 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // red = reduce(red, body(i)); // } // red = init; - // localred[m]; - // forall(k = ...){ + // localred[m+1]; + // + // forall(k ∈ 0..m-1){ // localred[i] = body(j_0); - // for(j = j_1 ...) + // for(j ∈ j_k_1..j_k_l-1) // reduce(localred+i, body(j)); // } - // for(k = ...) + // for( j ∈ j_k_m .. 
n ) + // reduce(localred+m, body(j)); + // } + // for(k ∈ 0..m) // reduce(&red, localred[k]); // ptr->replaceUsesWithIf(lptr, [L](Use &u){ @@ -1709,25 +1721,34 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, BB.CreateGEP(ty, al, Idx), ptr->getType()); auto x = BB.CreateLoad(ty, lptr); + BB.SetCurrentDebugLocation(ci->getDebugLoc()); BB.CreateCall(ci->getCalledFunction(), { ptr, x }); } Value *IdxAdd = BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), Idx->getName() + ".add"); - Idx->addIncoming(IdxAdd, bodyTerm->getParent()); + BasicBlock* body = bodyTerm->getParent(); + BasicBlock* loopExit = exitTerm->getParent(); + Idx->addIncoming(IdxAdd, body); ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + + // Update Loopinfo with reduction loop + Loop* RL = LI->AllocateLoop(); + if(ParentLoop) ParentLoop->addChildLoop(RL); + else LI->addTopLevelLoop(RL); + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); } LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); // TODO: fix DT updates DT->recalculate(*F); - /* + #ifndef NDEBUG DT->verify(); LI->verify(*DT); #endif - */ return NewLoop; } From df151f99e5f8420c5afa6b308632fe9f5ae3cccd Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 21 Sep 2023 09:22:56 -0600 Subject: [PATCH 10/23] Fixed reductions use of pointer types --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 2017ea93056de..4aabe0fc8fe44 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1615,7 +1615,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, for(CallInst* ci : reductions){ // TODO: generic allocation/free calls auto ptr = ci->getArgOperand(0); - auto ty = 
dyn_cast(ptr->getType())->getElementType(); + auto ty = dyn_cast(ptr->getType())->getArrayElementType(); auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); @@ -1674,7 +1674,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if(!reductions.empty()){ // Peel the first iteration of the loop and replace the reduction calls in // the peeled code with stores - peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA); + ValueToValueMapTy VMap; + peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); SmallVector cis; for(auto &BB : NewLoop->blocks()){ if(!L->contains(BB)){ // better way? @@ -1716,7 +1717,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // reduce them. for(auto& kv : redMap){ const auto [ ci, ptr, al ] = kv; - auto ty = dyn_cast(ptr->getType())->getElementType(); + auto ty = dyn_cast(ptr->getType())->getArrayElementType(); auto lptr = BB.CreateBitCast( BB.CreateGEP(ty, al, Idx), ptr->getType()); From 77142664f4f4b3c07e3aeafe3bc44e9ab2a91195 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 12 Sep 2024 14:35:33 -0600 Subject: [PATCH 11/23] working reductions on 16.x --- llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 ++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 ++ llvm/lib/Transforms/Tapir/GPUABI.cpp | 3 ++- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 19 ++++++++++--------- 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 14c6e28d55369..a45afe2dfe615 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -813,6 +813,7 @@ enum AttributeKindCodes { ATTR_KIND_KIT_KERNEL_PROPS = 
109, ATTR_KIND_KIT_KERNEL = 110, ATTR_KIND_KIT_DEVICE = 111, + ATTR_KIND_KIT_REDUCTION = 112, }; enum ComdatSelectionKindCodes { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 35c4a8bd78229..945f422ef6a0e 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2260,6 +2260,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::KitKernel; case bitc::ATTR_KIND_KIT_DEVICE: return Attribute::KitDevice; + case bitc::ATTR_KIND_KIT_REDUCTION: + return Attribute::KitsuneReduction; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 05a103aae5ddd..c9451018b070b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -960,6 +960,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_KIT_KERNEL; case Attribute::KitDevice: return bitc::ATTR_KIND_KIT_DEVICE; + case Attribute::KitsuneReduction: + return bitc::ATTR_KIND_KIT_REDUCTION; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index 763858e931389..b20f63e71c0cf 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -306,7 +306,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, SmallVector Returns; CloneFunctionInto(deviceF, f, VMap, CloneFunctionChangeType::DifferentModule, Returns); // GPU calls are slow as balls, try to force inlining - deviceF->addFnAttr(Attribute::AlwaysInline); + if(!deviceF->hasFnAttribute(Attribute::NoInline)) + deviceF->addFnAttr(Attribute::AlwaysInline); todo.push_back(deviceF); } } diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 
4aabe0fc8fe44..e5a5f99a2bb69 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1579,7 +1579,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // accumulate reductions in main loop const std::vector& blocks = L->getBlocks(); - std::set reductions; + std::set> reductions; for (BasicBlock *BB : blocks){ for (Instruction &I : *BB) { if(auto ci = dyn_cast(&I)){ @@ -1587,7 +1587,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << "with reduction function: " << f->getName() << "\n"); - reductions.insert(ci); + auto ty = ci->getArgOperand(1)->getType(); + reductions.insert(std::make_pair(ci, ty)); //TODO: check the type to confirm valid reduction } } @@ -1597,7 +1598,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // accumulate reductions in epilog loop LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); - std::vector> redMap; + std::vector> redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. 
// TODO: Initialize local reductions with unit values @@ -1612,12 +1613,13 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, ConstantInt::get(TripCount->getType(), Count), "stripiter"); auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); - for(CallInst* ci : reductions){ + for(auto &pair : reductions){ // TODO: generic allocation/free calls + auto ci = pair.first; auto ptr = ci->getArgOperand(0); - auto ty = dyn_cast(ptr->getType())->getArrayElementType(); + auto ty = pair.second; auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); - auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); + auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty))); auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); //auto al = RB.CreateBitCast(rm, ty); //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction"); @@ -1625,7 +1627,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, auto lptr = BH.CreateBitCast( BH.CreateGEP(ty, al, NewIdx), ptr->getType()); - redMap.push_back(std::make_tuple(ci, ptr, al)); + redMap.push_back(std::make_tuple(ci, ptr, al, ty)); // Assume there is more than one element, and // use the first element for the first iteration of the loop. // roughly: @@ -1716,8 +1718,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // For each reduction, get the allocated thread local reduced values and // reduce them. 
for(auto& kv : redMap){ - const auto [ ci, ptr, al ] = kv; - auto ty = dyn_cast(ptr->getType())->getArrayElementType(); + const auto [ ci, ptr, al, ty ] = kv; auto lptr = BB.CreateBitCast( BB.CreateGEP(ty, al, Idx), ptr->getType()); From ca900c889e943ce5b0dd1faa57e380914b8935da Mon Sep 17 00:00:00 2001 From: George Stelle Date: Mon, 16 Sep 2024 12:42:34 -0600 Subject: [PATCH 12/23] added reductions example makefile --- kitsune-tests/reductions/makefile | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 kitsune-tests/reductions/makefile diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile new file mode 100644 index 0000000000000..78e4a2a8e81cb --- /dev/null +++ b/kitsune-tests/reductions/makefile @@ -0,0 +1,25 @@ +all: l2 l2_gpu l2_serial + +l2.ll : l2.c + clang -S -emit-llvm l2.c -O1 -ftapir=none + +l2_stripmined.ll: l2.ll + opt -S -passes="loop-stripmine" -o l2_stripmined.ll l2.ll + +l2_stripmined_opt.ll: l2_stripmined.ll + opt -S -O2 -o l2_stripmined_opt.ll l2_stripmined.ll + +l2: l2_stripmined_opt.ll + clang -ftapir=opencilk -O2 l2_stripmined_opt.ll -o l2 -lm -lllvm-gpu + +l2_gpu.ll: l2_stripmined_opt.ll + clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@ + +l2_gpu: l2_gpu.ll + clang l2_gpu.ll -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu + +l2_serial: l2.ll + clang l2.ll -o l2_serial -ftapir=serial -lllvm-gpu -lm + +clean: + rm -f l2_stripmined.ll l2 l2.ll From 430aa380385d2b91acb1deb4fc1d00bc09259997 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 31 Oct 2024 09:23:38 -0600 Subject: [PATCH 13/23] Added openmp reduce for comparison --- kitsune-tests/reductions/l2.c | 13 +++++-- kitsune-tests/reductions/l2_openmp.c | 55 ++++++++++++++++++++++++++++ kitsune-tests/reductions/makefile | 5 ++- 3 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 kitsune-tests/reductions/l2_openmp.c diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index e39b8765da54d..c8e936fb98b48 
100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -28,17 +28,22 @@ double l2_seq(uint64_t n, double* a){ } int main(int argc, char** argv){ - uint64_t n = argc > 1 ? atoi(argv[1]) : 2ULL<<28 ; + int e = argc > 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include + +double l2(uint64_t n, double* a){ + double red = 0; + #pragma omp parallel for reduction(+:red) + for(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< Date: Tue, 6 May 2025 10:28:57 -0600 Subject: [PATCH 14/23] GPU reductions via stripmining pass working --- clang/include/clang/Basic/Attr.td | 6 +- clang/lib/CodeGen/CGKitsune.cpp | 2 + clang/lib/Driver/Tapir.cpp | 84 +++ clang/lib/Driver/ToolChain.cpp | 45 ++ kitsune-tests/reductions/l2.c | 27 +- kitsune-tests/reductions/l2_openmp.c | 14 +- kitsune-tests/reductions/makefile | 31 +- kitsune/include/kitsune/kitsune.h | 9 + llvm/CMakeLists.txt | 1 + llvm/include/llvm/IR/Attributes.td | 3 - llvm/include/llvm/Transforms/Tapir/GPUABI.h | 2 +- .../llvm/Transforms/Tapir/LoopStripMine.h | 3 +- .../llvm/Transforms/Tapir/TapirTargetIDs.h | 115 ++++ llvm/lib/Transforms/Tapir/GPUABI.cpp | 69 +-- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 510 +++++++++++------- .../Transforms/Tapir/LoopStripMinePass.cpp | 22 +- llvm/lib/Transforms/Tapir/LoweringUtils.cpp | 4 + 17 files changed, 656 insertions(+), 291 deletions(-) create mode 100644 clang/lib/Driver/Tapir.cpp create mode 100644 llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index bda7c1e4d6c21..f4f7d7f6e666f 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -5233,7 +5233,7 @@ def NonString : InheritableAttr { def KitsuneReduction : InheritableAttr { let Spellings = [Clang<"kitsune_reduction">]; let Subjects = 
SubjectList<[FunctionLike]>; - let Documentation = [StrandMallocDocs]; + let Documentation = [TapirRTDocs]; } def TapirTarget : StmtAttr { @@ -5254,9 +5254,9 @@ def TapirTarget : StmtAttr { ErrorDiag, "'parallel' statements">; let Args = [ EnumArgument<"TapirTargetAttrType", "TapirTargetAttrTy", /*is_string=*/ true, - ["nolo", "serial", "cuda", "hip", "opencilk", "openmp", + ["nolo", "serial", "cuda", "hip", "gpu", "opencilk", "openmp", "qthreads", "realm"], - ["Nolo", "Serial", "Cuda", "Hip", "OpenCilk", "OpenMP", + ["Nolo", "Serial", "Cuda", "Hip", "GPU", "OpenCilk", "OpenMP", "Qthreads", "Realm"], 0> ]; diff --git a/clang/lib/CodeGen/CGKitsune.cpp b/clang/lib/CodeGen/CGKitsune.cpp index 9f99ee449e832..bc5d9f4ea7f92 100644 --- a/clang/lib/CodeGen/CGKitsune.cpp +++ b/clang/lib/CodeGen/CGKitsune.cpp @@ -106,6 +106,8 @@ CodeGenFunction::GetTapirTargetAttr(ArrayRef Attrs) { return llvm::TTID::Cuda; case TapirTargetAttr::Hip: return llvm::TTID::Hip; + case TapirTargetAttr::GPU: + return llvm::TTID::GPU; case TapirTargetAttr::OpenCilk: return llvm::TTID::OpenCilk; case TapirTargetAttr::OpenMP: diff --git a/clang/lib/Driver/Tapir.cpp b/clang/lib/Driver/Tapir.cpp new file mode 100644 index 0000000000000..0c9826526fcb1 --- /dev/null +++ b/clang/lib/Driver/Tapir.cpp @@ -0,0 +1,84 @@ +//===--- Tapir.cpp - C Language Family Language Options ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the functions from Tapir.h +// +//===----------------------------------------------------------------------===// + +#include "clang/Driver/Tapir.h" +#include "clang/Driver/Options.h" +#include "clang/Driver/ToolChain.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Transforms/Tapir/TapirTargetIDs.h" + +using namespace clang::driver; +using namespace llvm; + +std::optional clang::parseTapirTarget(const opt::ArgList &Args) { + if (const opt::Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) + return llvm::StringSwitch>(A->getValue()) + .Case("none", TapirTargetID::None) + .Case("serial", TapirTargetID::Serial) + .Case("cuda", TapirTargetID::Cuda) + .Case("hip", TapirTargetID::Hip) + .Case("gpu", TapirTargetID::GPU) + .Case("opencilk", TapirTargetID::OpenCilk) + .Case("openmp", TapirTargetID::OpenMP) + .Case("qthreads", TapirTargetID::Qthreads) + .Case("realm", TapirTargetID::Realm) + .Default(std::nullopt); + return std::nullopt; +} + +std::optional +clang::parseTapirNVArchTarget(const opt::ArgList &Args) { + if (const opt::Arg *A = Args.getLastArg(options::OPT_ftapir_nvarch_EQ)) + return llvm::StringSwitch(A->getValue()) + .Case("sm_50", TapirNVArchTargetID::SM_50) + .Case("sm_52", TapirNVArchTargetID::SM_52) + .Case("sm_53", TapirNVArchTargetID::SM_53) + .Case("sm_60", TapirNVArchTargetID::SM_60) + .Case("sm_62", TapirNVArchTargetID::SM_62) + .Case("sm_70", TapirNVArchTargetID::SM_70) + .Case("sm_75", TapirNVArchTargetID::SM_75) + .Case("sm_80", TapirNVArchTargetID::SM_80) + .Case("sm_86", TapirNVArchTargetID::SM_86) + .Case("sm_90", TapirNVArchTargetID::SM_90) + .Default(TapirNVArchTargetID::Last_TapirNVArchTargetID); + + return std::nullopt; +} + +std::optional +clang::getTargetConfigFileName(const opt::ArgList &Args) { + if 
(std::optional tt = parseTapirTarget(Args)) { + switch (*tt) { + case TapirTargetID::None: + return "none.cfg"; + case TapirTargetID::Serial: + return "serial.cfg"; + case TapirTargetID::Cuda: + return "cuda.cfg"; + case TapirTargetID::Hip: + return "hip.cfg"; + case TapirTargetID::OpenCilk: + return "opencilk.cfg"; + case TapirTargetID::OpenMP: + return "openmp.cfg"; + case TapirTargetID::Qthreads: + return "qthreads.cfg"; + case TapirTargetID::Realm: + return "realm.cfg"; + default: + return std::nullopt; + } + } + return std::nullopt; +} diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index a953a300a8309..670f43e8e9ff2 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -2272,11 +2272,14 @@ void ToolChain::AddKitsuneRealmCommonArgs(const ArgList &Args, void ToolChain::AddKitsunePreprocessorArgs(const ArgList &Args, ArgStringList &CmdArgs) const { +<<<<<<< HEAD auto AddTTArgs = [&](TTID TT, const ArgList &Args, ArgStringList &CmdArgs) -> void { switch (TT) { case TTID::Nolo: return; + case TTID:GPU: + return; case TTID::Cuda: return ExtractArgsFromString(KITSUNE_CUDA_EXTRA_PREPROCESSOR_FLAGS, CmdArgs, Args); @@ -2341,6 +2344,7 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, case TTID::Hip: AddKitsuneHipCommonArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_HIP_EXTRA_COMPILER_FLAGS, CmdArgs, Args); +<<<<<<< HEAD return; case TTID::Lambda: AddKitsuneLambdaCommonArgs(Args, CmdArgs); @@ -2353,6 +2357,12 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, return; case TTID::OpenCilk: AddKitsuneOpenCilkCommonArgs(Args, CmdArgs); +======= + break; + case llvm::TapirTargetID::GPU: + break; + case llvm::TapirTargetID::OpenCilk: +>>>>>>> 82193e08056a (GPU reductions via stripmining pass working) ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_COMPILER_FLAGS, CmdArgs, Args); return; @@ -2551,6 +2561,7 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args, case TTID::Hip: 
AddKitsuneHipLinkerArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_HIP_EXTRA_LINKER_FLAGS, CmdArgs, Args); +<<<<<<< HEAD return; case TTID::Lambda: AddKitsuneLambdaLinkerArgs(Args, CmdArgs); @@ -2562,6 +2573,40 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args, return; case TTID::OpenCilk: AddKitsuneOpenCilkLinkerArgs(Args, CmdArgs); +======= + break; + case llvm::TapirTargetID::GPU: + break; + + case llvm::TapirTargetID::OpenCilk: { + bool StaticOpenCilk = Args.hasArg(options::OPT_static); + bool UseAsan = getSanitizerArgs(Args).needsAsanRt(); + + // Link the correct Cilk personality fn + if (getDriver().CCCIsCXX()) + CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( + Args, + UseAsan ? "opencilk-asan-personality-cpp" + : "opencilk-personality-cpp", + StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); + else + CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( + Args, + UseAsan ? "opencilk-asan-personality-c" : "opencilk-personality-c", + StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); + + // Link the opencilk runtime. We do this after linking the personality + // function, to ensure that symbols are resolved correctly when using + // static linking. + CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( + Args, UseAsan ? "opencilk-asan" : "opencilk", + StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); + + // Add to the executable's runpath the default directory containing + // OpenCilk runtime. 
+ addOpenCilkRuntimeRunPath(*this, Args, CmdArgs, Triple); + +>>>>>>> 82193e08056a (GPU reductions via stripmining pass working) ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_LINKER_FLAGS, CmdArgs, Args); return; case TTID::OpenMP: diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index c8e936fb98b48..3e821a6d02c74 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include reduction void sum(double *a, double b){ @@ -19,14 +21,6 @@ double l2(uint64_t n, double* a){ return sqrt(red); } -double l2_seq(uint64_t n, double* a){ - double red = 0; - for(uint64_t i=0; i < n; i++){ - sum(&red, a[i]*a[i]); - } - return sqrt(red); -} - int main(int argc, char** argv){ int e = argc > 1 ? atoi(argv[1]) : 28; int niter = argc > 2 ? atoi(argv[2]) : 100; @@ -39,21 +33,16 @@ int main(int argc, char** argv){ l2(n, arr); - clock_t before = clock(); - double par; + double par = 0; + double before = omp_get_wtime(); for(int i=0; i #include #include +#include double l2(uint64_t n, double* a){ double red = 0; @@ -35,20 +36,15 @@ int main(int argc, char** argv){ l2(n, arr); - clock_t before = clock(); + double before = omp_get_wtime(); double par; for(int i=0; i +#include + +#if defined(reduction) +#warning found reduction definition: try puttin kitsune lower in include order +#else +#define reduction __attribute__((noinline, kitsune_reduction)) +#endif + #if defined(spawn) // FIXME KITSUNE: Should this be an error instead of a warning? #warning encountered multiple definitions of spawn! diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 9192fca94451d..90f0dd6dc3edf 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -444,6 +444,7 @@ list(JOIN KITSUNE_ENABLED_LANGS_STR " " KITSUNE_ENABLED_LANGS_STR) # affects clang. And we have ensured that kitsune is configured before clang. 
set(KITSUNE_CUDA_ENABLED OFF CACHE INTERNAL "Enable 'cuda' tapir target" FORCE) set(KITSUNE_HIP_ENABLED OFF CACHE INTERNAL "Enable 'hip' tapir target" FORCE) +set(KITSUNE_GPU_ENABLED OFF CACHE INTERNAL "Enable 'gpu' tapir target" FORCE) set(KITSUNE_LAMBDA_ENABLED OFF CACHE INTERNAL "Enable 'lambda' tapir target" FORCE) set(KITSUNE_OMPTASK_ENABLED OFF CACHE INTERNAL "Enable 'omptask' tapir target" FORCE) set(KITSUNE_OPENCILK_ENABLED OFF CACHE INTERNAL "Enable 'opencilk' tapir target" FORCE) diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 754fe8ca27f74..aa2a9ec523d84 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -276,8 +276,6 @@ def ReadNone : EnumAttr<"readnone", IntersectAnd, [ParamAttr]>; /// Function only reads from memory. def ReadOnly : EnumAttr<"readonly", IntersectAnd, [ParamAttr]>; -<<<<<<< HEAD -======= /// Tapir reducer-related attributes. def HyperView : EnumAttr<"hyper_view", [FnAttr]>; def HyperToken : EnumAttr<"hyper_token", [FnAttr]>; @@ -285,7 +283,6 @@ def ReducerRegister : EnumAttr<"reducer_register", [FnAttr]>; def ReducerUnregister : EnumAttr<"reducer_unregister", [FnAttr]>; def KitsuneReduction : EnumAttr<"kitsune_reduction", [FnAttr]>; ->>>>>>> e2078e0739d8 (Working reductions, sequential semantics preserved for commutative associative magmas) /// Return value is always equal to this argument. 
def Returned : EnumAttr<"returned", IntersectAnd, [ParamAttr]>; diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h index 7e0ccda6af380..e11aeff31f696 100644 --- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h +++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h @@ -32,7 +32,7 @@ class GPUABI : public TapirTarget { void lowerSync(SyncInst &SI) override final; void addHelperAttributes(Function &F) override final {} - void preProcessFunction(Function &F, TaskInfo &TI, + bool preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) override final; void postProcessFunction(Function &F, bool OutliningTapirLoops) override final; diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h index 270b77794620c..4940f3d7d463a 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h @@ -47,7 +47,8 @@ Loop *StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, AssumptionCache *AC, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, bool ParallelEpilog, bool NeedNestedSync, - Loop **Remainderloop = nullptr); + Loop **Remainderloop = nullptr, + bool GPU = false); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h b/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h new file mode 100644 index 0000000000000..e4305d139cb26 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h @@ -0,0 +1,115 @@ +//===- TapirTargetIDs.h - Tapir target ID's --------------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file enumerates the available Tapir lowering targets. +// +//===----------------------------------------------------------------------===// + +#ifndef TAPIR_TARGET_IDS_H_ +#define TAPIR_TARGET_IDS_H_ + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +enum class TapirTargetID { + None, // Perform no lowering + Serial, // Lower to serial projection + Cuda, // Lower to Cuda ABI + Hip, // Lower to the Hip (AMD GPU) ABI + GPU, // Lower to the GPU ABI + Lambda, // Lower to generic Lambda ABI + OMPTask, // Lower to OpenMP task ABI + OpenCilk, // Lower to OpenCilk ABI + OpenMP, // Lower to OpenMP (TODO: Needs to be updated) + Qthreads, // Lower to Qthreads (TODO: Needs to be updated) + Realm, // Lower to Realm (TODO: Needs to be updated) + Last_TapirTargetID +}; + +// Serialize the Tapir target into the given output stream. This will write a +// string representation that is compatible with the -ftapir argument used in +// clang. +raw_ostream &operator<<(raw_ostream &os, const TapirTargetID &Target); + +enum class TapirNVArchTargetID { + SM_50, // TODO: Remove depcreated targets based on latest CUDA releases. + SM_52, + SM_53, + SM_60, // Pascal + SM_61, + SM_62, + SM_70, // Volta + SM_72, + SM_75, // Turing + SM_80, // Ampere + SM_86, + SM_90, // Hopper + // TODO: Update this enum when we sync w/ upstream LLVM capabilities. + Last_TapirNVArchTargetID +}; + +// Serialize the Tapir target into the given output stream. This will write a +// string representation that is compatible with the -ftapir argument used in +// clang. +raw_ostream &operator<<(raw_ostream &os, const TapirTargetID &Target); + +// Tapir target options + +// Virtual base class for Target-specific options. 
+class TapirTargetOptions { +public: + enum TapirTargetOptionKind { TTO_OpenCilk, Last_TTO }; + +private: + const TapirTargetOptionKind Kind; + +public: + TapirTargetOptionKind getKind() const { return Kind; } + + TapirTargetOptions(TapirTargetOptionKind K) : Kind(K) {} + TapirTargetOptions(const TapirTargetOptions &) = delete; + TapirTargetOptions &operator=(const TapirTargetOptions &) = delete; + virtual ~TapirTargetOptions() {} + + // Top-level method for cloning TapirTargetOptions. Defined in + // TargetLibraryInfo. + TapirTargetOptions *clone() const; +}; + +// Options for OpenCilkABI Tapir target. +class OpenCilkABIOptions : public TapirTargetOptions { + std::string RuntimeBCPath; + + OpenCilkABIOptions() = delete; + +public: + OpenCilkABIOptions(StringRef Path) + : TapirTargetOptions(TTO_OpenCilk), RuntimeBCPath(Path) {} + + StringRef getRuntimeBCPath() const { + return RuntimeBCPath; + } + + static bool classof(const TapirTargetOptions *TTO) { + return TTO->getKind() == TTO_OpenCilk; + } + +protected: + friend TapirTargetOptions; + + OpenCilkABIOptions *cloneImpl() const { + return new OpenCilkABIOptions(RuntimeBCPath); + } +}; + +} // end namespace llvm + +#endif diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index b20f63e71c0cf..74106e56f1862 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -12,11 +12,7 @@ // //===----------------------------------------------------------------------===// -#pragma warning "GPUABI has been deprecated" -#if 0 - #include "llvm/Transforms/Tapir/GPUABI.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" @@ -25,12 +21,9 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" -#include "llvm/Transforms/Vectorize.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" #include 
"llvm/Bitcode/BitcodeWriter.h" #include "llvm/IRReader/IRReader.h" #include "llvm/MC/TargetRegistry.h" -#include #include using namespace llvm; @@ -45,11 +38,16 @@ static cl::opt "(default=false)")); Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { - Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); + IRBuilder<> BH(GrainsizeCall); + auto *M = GrainsizeCall->getModule(); + Type *LLVMInt64Ty = Type::getInt64Ty(M->getContext()); + Value *GS = BH.CreateCall(M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty)); + //FunctionCallee GGS = M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty); // Replace uses of grainsize intrinsic call with this grainsize value. - GrainsizeCall->replaceAllUsesWith(Grainsize); - return Grainsize; + //GrainsizeCall->setCalledFunction(GGS); + GrainsizeCall->replaceAllUsesWith(GS); + return GS; } void GPUABI::lowerSync(SyncInst &SI) { @@ -61,8 +59,9 @@ void GPUABI::postProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm:: void GPUABI::preProcessRootSpawner(llvm::Function&, BasicBlock *TFEntry){} void GPUABI::postProcessRootSpawner(llvm::Function&, BasicBlock *TFEntry){} -void GPUABI::preProcessFunction(Function &F, TaskInfo &TI, +bool GPUABI::preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) { + return false; } void GPUABI::postProcessFunction(Function &F, bool OutliningTapirLoops) { @@ -104,15 +103,12 @@ LLVMLoop::LLVMLoop(Module &M) // Insert runtime-function declarations in LLVM host modules. 
Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); - Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext()); GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty); - Function* getid = LLVMM.getFunction("gtid"); + Type *VoidTy = Type::getVoidTy(M.getContext()); - Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); + Type *VoidPtrTy = PointerType::getUnqual(M.getContext()); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - Type *Int8Ty = Type::getInt8Ty(M.getContext()); - Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty); @@ -184,17 +180,12 @@ unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { - LLVMContext &Ctx = M.getContext(); - Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); - //Type *Int64Ty = Type::getInt64Ty(Ctx); - //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); - BasicBlock *Entry = cast(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast(VMap[L->getHeader()]); + BasicBlock *Latch = cast(VMap[L->getLoopLatch()]); BasicBlock *Exit = cast(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); @@ -208,10 +199,18 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Get the thread ID for this invocation of Helper. 
IRBuilder<> B(Entry->getTerminator()); Value *ThreadIdx = B.CreateCall(GetThreadIdx); - //Value *BlockIdx = B.CreateCall(GetBlockIdx, ConstantInt::get(Int32Ty, 0)); - //Value *BlockDim = B.CreateCall(GetBlockDim, ConstantInt::get(Int32Ty, 0)); Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); + // Loop should be handled in stripmining, here we just remove the loop by setting it to a jump + BranchInst *BI = cast(Latch->getTerminator()); + if(BI->getSuccessor(0) == Exit) + BI->setCondition(ConstantInt::get(BI->getCondition()->getType(), true)); + else + BI->setCondition(ConstantInt::get(BI->getCondition()->getType(), false)); + + //AV.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(LLVMCtx), + // 1))); + Function *Helper = Out.Outline; Helper->setName("kitsune_kernel"); @@ -225,19 +224,10 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // the loop limit with stride 1. The End argument encodes the loop limit. // Get end and grainsize arguments Argument *End; - Value *Grainsize; { - auto OutlineArgsIter = Helper->arg_begin(); + auto *OutlineArgsIter = Helper->arg_begin(); // End argument is the first LC arg. End = &*OutlineArgsIter; - - // Get the grainsize value, which is either constant or the third LC arg. - // ReplaceInstWithInst(gep, GetElementPtrInst::Create( - if (unsigned ConstGrainsize = TL.getGrainsize()) - Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - else - // Grainsize argument is the third LC arg. 
- Grainsize = &*++(++OutlineArgsIter); } Value *Cond = B.CreateICmpUGE(ThreadID, End); @@ -261,9 +251,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); - Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); + PointerType *VoidPtrTy = PointerType::getUnqual(Ctx); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM); Function *Parent = TOI.ReplCall->getFunction(); @@ -277,7 +266,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // Compile the kernel //LLVMM.getFunctionList().remove(TOI.Outline); //TOI.Outline->eraseFromParent(); - LLVMContext &LLVMCtx = LLVMM.getContext(); ValueToValueMapTy VMap; // We recursively add definitions and declarations to the device module @@ -356,12 +344,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, PassManager->add(createReassociatePass()); PassManager->add(createGVNPass()); PassManager->add(createCFGSimplificationPass()); - PassManager->add(createLoopVectorizePass()); - PassManager->add(createSLPVectorizerPass()); //PassManager->add(createBreakCriticalEdgesPass()); //PassManager->add(createConstantPropagationPass()); PassManager->add(createDeadCodeEliminationPass()); - PassManager->add(createDeadStoreEliminationPass()); //PassManager->add(createInstructionCombiningPass()); PassManager->add(createCFGSimplificationPass()); PassManager->add(createDeadCodeEliminationPass()); @@ -391,9 +376,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); - Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); - Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); Constant *kernelSize = ConstantInt::get(Int64Ty, 
LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); @@ -422,4 +404,3 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } -#endif diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index e5a5f99a2bb69..3d1d008c7ebb4 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -767,7 +767,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, const TargetTransformInfo &TTI, AssumptionCache *AC, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, bool ParallelEpilog, - bool NeedNestedSync, Loop **RemainderLoop) { + bool NeedNestedSync, Loop **RemainderLoop, + bool GPU) { Task *T = getTapirLoopForStripMining(L, *TI, ORE); if (!T) return nullptr; @@ -816,6 +817,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, BasicBlock *Latch = L->getLoopLatch(); BasicBlock *Header = L->getHeader(); BasicBlock *TaskEntry = T->getEntry(); + assert(isa(Header->getTerminator()) && "Header not terminated by a detach."); DetachInst *DI = cast(Header->getTerminator()); @@ -883,7 +885,12 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, return nullptr; } - LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << Count << "\n"); + if(GPU){ + LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << "gpu grainsize call" << "\n"); + } + else { + LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << Count << "\n"); + } using namespace ore; ORE->emit([&]() { return OptimizationRemark(LSM_NAME, "Stripmined", @@ -899,6 +906,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // ... // Latch // LatchExit + Module *M = F->getParent(); // Insert the epilog remainder. 
BasicBlock *NewPreheader; @@ -942,41 +950,65 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // *EpilogPreheader // LatchExit - IRBuilder<> B(PreheaderBR); + IRBuilder<> B(PreheaderBR); Value *ModVal; - // Calculate ModVal = (BECount + 1) % Count. - // Note that TripCount is BECount + 1. - if (isPowerOf2_32(Count)) { - // When Count is power of 2 we don't BECount for epilog case. However we'll - // need it for a branch around stripmined loop for prolog case. - ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - // 1. There are no iterations to be run in the prolog/epilog loop. - // OR - // 2. The addition computing TripCount overflowed. - // - // If (2) is true, we know that TripCount really is (1 << BEWidth) and so - // the number of iterations that remain to be run in the original loop is a - // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we - // explicitly check this above). - if (TL.isInclusiveRange()) - ModVal = B.CreateAdd(ModVal, ConstantInt::get(ModVal->getType(), 1)); - } else { - // As (BECount + 1) can potentially unsigned overflow we count - // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. - Value *ModValTmp = B.CreateURem(BECount, - ConstantInt::get(BECount->getType(), - Count)); - Value *ModValAdd = B.CreateAdd(ModValTmp, - ConstantInt::get(ModValTmp->getType(), 1)); - // At that point (BECount % Count) + 1 could be equal to Count. - // To handle this case we need to take mod by Count one more time. 
- ModVal = B.CreateURem(ModValAdd, - ConstantInt::get(BECount->getType(), Count), - "xtraiter"); + Value *StepSize; + Value *BranchVal; + // Int the gpu case we don't need an epilogue + if(GPU){ + ModVal = TripCount; + //B.SetInsertPoint(F->getEntryBlock().getFirstNonPHI()); + + Instruction* bloc; + if(Instruction* I = dyn_cast(TripCount)){ + bloc = I->getNextNode(); + } else { + bloc = F->getEntryBlock().getTerminator(); + } + + IRBuilder<> B2(bloc); + StepSize = B2.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::tapir_loop_grainsize, + { TripCount->getType() }), { TripCount }); + + + BranchVal = B.CreateICmpULE(ModVal, ConstantInt::get(ModVal->getType(), 0)); + } + else { + // Calculate ModVal = (BECount + 1) % Count. + // Note that TripCount is BECount + 1. + if (isPowerOf2_32(Count)) { + // When Count is power of 2 we don't BECount for epilog case. However we'll + // need it for a branch around stripmined loop for prolog case. + ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); + // 1. There are no iterations to be run in the prolog/epilog loop. + // OR + // 2. The addition computing TripCount overflowed. + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so + // the number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). + if (TL.isInclusiveRange()) + ModVal = B.CreateAdd(ModVal, ConstantInt::get(ModVal->getType(), 1)); + } else { + // As (BECount + 1) can potentially unsigned overflow we count + // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. + Value *ModValTmp = B.CreateURem(BECount, + ConstantInt::get(BECount->getType(), + Count)); + Value *ModValAdd = B.CreateAdd(ModValTmp, + ConstantInt::get(ModValTmp->getType(), 1)); + // At that point (BECount % Count) + 1 could be equal to Count. + // To handle this case we need to take mod by Count one more time. 
+ ModVal = B.CreateURem(ModValAdd, + ConstantInt::get(BECount->getType(), Count), + "xtraiter"); + } + BranchVal = B.CreateICmpULT( + BECount, ConstantInt::get(BECount->getType(), + TL.isInclusiveRange() ? Count : Count - 1)); } - Value *BranchVal = B.CreateICmpSLT( - BECount, ConstantInt::get(BECount->getType(), - TL.isInclusiveRange() ? Count : Count - 1)); BasicBlock *RemainderLoopBB = NewExit; BasicBlock *StripminedLoopBB = NewPreheader; // Branch to either remainder (extra iterations) loop or stripmined loop. @@ -985,7 +1017,6 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if (DT) DT->changeImmediateDominator(NewExit, Preheader); - Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog/epilog code LoopBlocksDFS LoopBlocks(L); @@ -1054,21 +1085,22 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // TODO: For stripmine factor 2 remainder loop will have 1 iterations. // Do not create 1 iteration loop. // bool CreateRemainderLoop = (Count != 2); - bool CreateRemainderLoop = true; + bool CreateRemainderLoop = !GPU; // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. BasicBlock *InsertBot = LatchExit; BasicBlock *InsertTop = EpilogPreheader; - *RemainderLoop = - cloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, - InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, - ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI, Count); + if(CreateRemainderLoop){ + *RemainderLoop = + cloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, + InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, + ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI); - // Insert the cloned blocks into the function. 
- F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), - F->end()); + // Insert the cloned blocks into the function. + F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), + F->end()); // Loop structure should be the following: // Epilog @@ -1087,31 +1119,45 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Rewrite the cloned instruction operands to use the values created when the // clone is created. - for (BasicBlock *BB : NewBlocks) - for (Instruction &I : *BB) - RemapInstruction(&I, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + for (BasicBlock *BB : NewBlocks) + for (Instruction &I : *BB) + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Serialize the cloned loop body to render the inner loop serial. - { - // Translate all the analysis for the new cloned task. - SmallVector ClonedReattaches; - for (Instruction *I : Reattaches) - ClonedReattaches.push_back(cast(VMap[I])); - SmallPtrSet ClonedEHBlockPreds; - for (BasicBlock *B : EHBlockPreds) - ClonedEHBlockPreds.insert(cast(VMap[B])); - SmallVector ClonedEHBlocks; - for (BasicBlock *B : EHBlocksToClone) - ClonedEHBlocks.push_back(cast(VMap[B])); - // Landing pads and detached-rethrow instructions may or may not have been - // cloned. - SmallPtrSet ClonedInlinedLPads; - for (LandingPadInst *LPad : InlinedLPads) { - if (VMap[LPad]) - ClonedInlinedLPads.insert(cast(VMap[LPad])); - else - ClonedInlinedLPads.insert(LPad); + // Serialize the cloned loop body to render the inner loop serial. + { + // Translate all the analysis for the new cloned task. 
+ SmallVector ClonedReattaches; + for (Instruction *I : Reattaches) + ClonedReattaches.push_back(cast(VMap[I])); + SmallPtrSet ClonedEHBlockPreds; + for (BasicBlock *B : EHBlockPreds) + ClonedEHBlockPreds.insert(cast(VMap[B])); + SmallVector ClonedEHBlocks; + for (BasicBlock *B : EHBlocksToClone) + ClonedEHBlocks.push_back(cast(VMap[B])); + // Landing pads and detached-rethrow instructions may or may not have been + // cloned. + SmallPtrSet ClonedInlinedLPads; + for (LandingPadInst *LPad : InlinedLPads) { + if (VMap[LPad]) + ClonedInlinedLPads.insert(cast(VMap[LPad])); + else + ClonedInlinedLPads.insert(LPad); + } + SmallVector ClonedDetachedRethrows; + for (Instruction *DR : DetachedRethrows) { + if (VMap[DR]) + ClonedDetachedRethrows.push_back(cast(VMap[DR])); + else + ClonedDetachedRethrows.push_back(DR); + } + DetachInst *ClonedDI = cast(VMap[DI]); + // Serialize the new task. + SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, + ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, + &ClonedInlinedLPads, &ClonedDetachedRethrows, + NeedToInsertTaskFrame, DT, LI); } SmallVector ClonedDetachedRethrows; for (Instruction *DR : DetachedRethrows) { @@ -1131,7 +1177,6 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Detach the stripmined loop. Value *SyncReg = DI->getSyncRegion(), *NewSyncReg; BasicBlock *EpilogPred, *LoopDetEntry, *LoopReattach; - Module *M = F->getParent(); if (ParallelEpilog) { ORE->emit([&]() { return OptimizationRemark(LSM_NAME, "ParallelEpil", @@ -1344,48 +1389,66 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // // TODO: Generalize to handle non-power-of-2 counts. 
assert(isPowerOf2_32(Count) && "Count is not a power of 2."); - Value *TestVal = B2.CreateUDiv(TripCount, - ConstantInt::get(TripCount->getType(), Count), - "stripiter"); - // Value *TestVal = B2.CreateSub(TripCount, ModVal, "stripiter", true, true); - - // Value *TestCmp = B2.CreateICmpUGT(TestVal, - // ConstantInt::get(TestVal->getType(), 0), - // TestVal->getName() + ".ncmp"); - // ReplaceInstWithInst(NewPreheader->getTerminator(), - // BranchInst::Create(Header, LatchExit, TestCmp)); - // DT->changeImmediateDominator(LatchExit, - // DT->findNearestCommonDominator(LatchExit, - // NewPreheader)); - - // Add new counter for new outer loop. - // - // We introduce a new primary induction variable, NewIdx, into the outer loop, - // which counts up to the outer-loop trip count from 0, stepping by 1. In - // contrast to counting down from the outer-loop trip count, this new variable - // ensures that future loop passes, including LoopSpawning, can process this - // outer loop when we're done. 
- PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter"); - NewIdx->insertBefore(NewHeader->getFirstNonPHIIt()); - B2.SetInsertPoint(NewLatch->getTerminator()); - // Instruction *IdxSub = cast( - // B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), - // NewIdx->getName() + ".nsub")); - // IdxSub->copyIRFlags(PrimaryInc); - Instruction *IdxAdd = cast( - B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), - NewIdx->getName() + ".nadd")); - IdxAdd->copyIRFlags(PrimaryInc); - - // NewIdx->addIncoming(TestVal, NewPreheader); - // NewIdx->addIncoming(IdxSub, NewLatch); - // Value *IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); - NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); - NewIdx->addIncoming(IdxAdd, NewLatch); - Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, - NewIdx->getName() + ".ncmp"); - ReplaceInstWithInst(NewLatch->getTerminator(), - BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + PHINode *NewIdx; + if(GPU){ + Value *TestVal = StepSize; + NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + NewHeader->getFirstNonPHIIt()); + B2.SetInsertPoint(NewLatch->getTerminator()); + Instruction *IdxAdd = cast( + B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nadd")); + IdxAdd->copyIRFlags(PrimaryInc); + NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); + NewIdx->addIncoming(IdxAdd, NewLatch); + Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, + NewIdx->getName() + ".ncmp"); + ReplaceInstWithInst(NewLatch->getTerminator(), + BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + } else { + Value *TestVal = B2.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); + // Value *TestVal = B2.CreateSub(TripCount, ModVal, "stripiter", true, true); + + // Value *TestCmp = B2.CreateICmpUGT(TestVal, + // ConstantInt::get(TestVal->getType(), 0), + // TestVal->getName() + ".ncmp"); + // 
ReplaceInstWithInst(NewPreheader->getTerminator(), + // BranchInst::Create(Header, LatchExit, TestCmp)); + // DT->changeImmediateDominator(LatchExit, + // DT->findNearestCommonDominator(LatchExit, + // NewPreheader)); + + // Add new counter for new outer loop. + // + // We introduce a new primary induction variable, NewIdx, into the outer loop, + // which counts up to the outer-loop trip count from 0, stepping by 1. In + // contrast to counting down from the outer-loop trip count, this new variable + // ensures that future loop passes, including LoopSpawning, can process this + // outer loop when we're done. + NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + NewHeader->getFirstNonPHIIt()); + B2.SetInsertPoint(NewLatch->getTerminator()); + // Instruction *IdxSub = cast( + // B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + // NewIdx->getName() + ".nsub")); + // IdxSub->copyIRFlags(PrimaryInc); + Instruction *IdxAdd = cast( + B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nadd")); + IdxAdd->copyIRFlags(PrimaryInc); + + // NewIdx->addIncoming(TestVal, NewPreheader); + // NewIdx->addIncoming(IdxSub, NewLatch); + // Value *IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); + NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); + NewIdx->addIncoming(IdxAdd, NewLatch); + Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, + NewIdx->getName() + ".ncmp"); + ReplaceInstWithInst(NewLatch->getTerminator(), + BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + } DT->changeImmediateDominator(NewLatch, NewHeader); // The block structure of the stripmined loop should now look like so: // @@ -1465,75 +1528,113 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Update all of the old PHI nodes B2.SetInsertPoint(NewEntry->getTerminator()); - Instruction *CountVal = cast( - B2.CreateMul(ConstantInt::get(NewIdx->getType(), Count), - NewIdx)); - 
CountVal->copyIRFlags(PrimaryInduction); - for (auto &InductionEntry : *TL.getInductionVars()) { - PHINode *OrigPhi = InductionEntry.first; - const InductionDescriptor &II = InductionEntry.second; - if (II.getStep()->isZero()) - // Nothing to do for this Phi - continue; - // Get the new step value for this Phi. - Value *PhiCount = !II.getStep()->getType()->isIntegerTy() - ? B2.CreateCast(Instruction::SIToFP, CountVal, - II.getStep()->getType()) - : B2.CreateSExtOrTrunc(CountVal, II.getStep()->getType()); - Value *NewStart = emitTransformedIndex(B2, PhiCount, SE, DL, II); - - // Get the old increment instruction for this Phi - int Idx = OrigPhi->getBasicBlockIndex(NewEntry); - OrigPhi->setIncomingValue(Idx, NewStart); + if(GPU){ + } + else{ + Instruction *CountVal = cast( + B2.CreateMul(ConstantInt::get(NewIdx->getType(), Count), + NewIdx)); + CountVal->copyIRFlags(PrimaryInduction); + for (auto &InductionEntry : *TL.getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + const InductionDescriptor &II = InductionEntry.second; + if (II.getStep()->isZero()) + // Nothing to do for this Phi + continue; + // Get the new step value for this Phi. + Value *PhiCount = !II.getStep()->getType()->isIntegerTy() + ? B2.CreateCast(Instruction::SIToFP, CountVal, + II.getStep()->getType()) + : B2.CreateSExtOrTrunc(CountVal, II.getStep()->getType()); + Value *NewStart = emitTransformedIndex(B2, PhiCount, SE, DL, II); + + // Get the old increment instruction for this Phi + int Idx = OrigPhi->getBasicBlockIndex(NewEntry); + OrigPhi->setIncomingValue(Idx, NewStart); + } } - // Add new induction variable for inner loop. 
- PHINode *InnerIdx = - PHINode::Create(PrimaryInduction->getType(), 2, "inneriter"); - InnerIdx->insertBefore(Header->getFirstNonPHIIt()); - Value *InnerTestVal = ConstantInt::get(PrimaryInduction->getType(), Count); - B2.SetInsertPoint(LatchBR); - Instruction *InnerSub = cast( - B2.CreateSub(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), - InnerIdx->getName() + ".nsub")); - InnerSub->copyIRFlags(PrimaryInc); - // Instruction *InnerAdd = cast( - // B2.CreateAdd(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), - // InnerIdx->getName() + ".nadd")); - // InnerAdd->copyIRFlags(PrimaryInc); - Value *InnerCmp; - if (LatchBR->getSuccessor(0) == Header) - InnerCmp = B2.CreateIsNotNull(InnerSub, InnerIdx->getName() + ".ncmp"); - else - InnerCmp = B2.CreateIsNull(InnerSub, InnerIdx->getName() + ".ncmp"); - InnerIdx->addIncoming(InnerTestVal, NewEntry); - InnerIdx->addIncoming(InnerSub, Latch); - // if (LatchBR->getSuccessor(0) == Header) - // InnerCmp = B2.CreateICmpNE(InnerAdd, InnerTestVal, - // InnerIdx->getName() + ".ncmp"); - // else - // InnerCmp = B2.CreateICmpEQ(InnerAdd, InnerTestVal, - // InnerIdx->getName() + ".ncmp"); - // InnerIdx->addIncoming(ConstantInt::get(InnerIdx->getType(), 0), NewEntry); - // InnerIdx->addIncoming(InnerAdd, Latch); - LatchBR->setCondition(InnerCmp); + if(GPU){ + // GPU mode: inner loop strides by grainsize. + PHINode *InnerIdx = + PHINode::Create(PrimaryInduction->getType(), 2, "inneriter", + Header->getFirstNonPHIIt()); + // Initialize inner index to zero. + //Value *Zero = ConstantInt::get(PrimaryInduction->getType(), 0); + B2.SetInsertPoint(LatchBR->getParent()->getFirstNonPHI()); + // Instead of subtracting one, add the grainsize. + + Value *NextIdx = B2.CreateAdd(InnerIdx, StepSize, + InnerIdx->getName() + ".nadd_stride"); + + //NextIdx->copyIRFlags(PrimaryInc); + // Check if the new index is still within the original trip count. 
+ InnerIdx->addIncoming(NewIdx, NewEntry); + InnerIdx->addIncoming(NextIdx, Latch); + Value *InnerCmp; + if (LatchBR->getSuccessor(0) == Header) + InnerCmp = B2.CreateICmpULT(NextIdx, TripCount, + InnerIdx->getName() + ".ncmp_final"); + + else + InnerCmp = B2.CreateICmpUGE(NextIdx, TripCount, + InnerIdx->getName() + ".ncmp_final"); + + LatchBR->setCondition(InnerCmp); + // In the gpu case, we actually want to replace the induction variable + PrimaryInduction->replaceAllUsesWith(InnerIdx); + } else { + // Add new induction variable for inner loop. + PHINode *InnerIdx = PHINode::Create(PrimaryInduction->getType(), 2, + "inneriter", + Header->getFirstNonPHIIt()); + Value *InnerTestVal = ConstantInt::get(PrimaryInduction->getType(), Count); + B2.SetInsertPoint(LatchBR); + Instruction *InnerSub = cast( + B2.CreateSub(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + InnerIdx->getName() + ".nsub")); + InnerSub->copyIRFlags(PrimaryInc); + // Instruction *InnerAdd = cast( + // B2.CreateAdd(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + // InnerIdx->getName() + ".nadd")); + // InnerAdd->copyIRFlags(PrimaryInc); + Value *InnerCmp; + if (LatchBR->getSuccessor(0) == Header) + InnerCmp = B2.CreateIsNotNull(InnerSub, InnerIdx->getName() + ".ncmp"); + else + InnerCmp = B2.CreateIsNull(InnerSub, InnerIdx->getName() + ".ncmp"); + InnerIdx->addIncoming(InnerTestVal, NewEntry); + InnerIdx->addIncoming(InnerSub, Latch); + // if (LatchBR->getSuccessor(0) == Header) + // InnerCmp = B2.CreateICmpNE(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // else + // InnerCmp = B2.CreateICmpEQ(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // InnerIdx->addIncoming(ConstantInt::get(InnerIdx->getType(), 0), NewEntry); + // InnerIdx->addIncoming(InnerAdd, Latch); + LatchBR->setCondition(InnerCmp); + } +>>>>>>> 82193e08056a (GPU reductions via stripmining pass working) // Connect the epilog code to the original loop and update the PHI functions. 
B2.SetInsertPoint(EpilogPreheader->getTerminator()); - // Compute the start of the epilog iterations. We use a divide and multiply - // by the power-of-2 count to simplify the SCEV's of the induction variables - // for later analysis passes. - // Value *EpilStartIter = B2.CreateSub(TripCount, ModVal); - Value *EpilStartIter = - B2.CreateMul(B2.CreateUDiv(TripCount, - ConstantInt::get(TripCount->getType(), Count)), - ConstantInt::get(TripCount->getType(), Count)); - if (Instruction *ESIInst = dyn_cast(EpilStartIter)) - ESIInst->copyIRFlags(PrimaryInc); - connectEpilog(TL, EpilStartIter, ModVal, EpilogPred, LoopReattach, NewExit, - LatchExit, Preheader, EpilogPreheader, VMap, DT, LI, SE, DL, - PreserveLCSSA); + if(!GPU){ + // Compute the start of the epilog iterations. We use a divide and multiply + // by the power-of-2 count to simplify the SCEV's of the induction variables + // for later analysis passes. + // Value *EpilStartIter = B2.CreateSub(TripCount, ModVal); + Value *EpilStartIter = + B2.CreateMul(B2.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count)), + ConstantInt::get(TripCount->getType(), Count)); + if (Instruction *ESIInst = dyn_cast(EpilStartIter)) + ESIInst->copyIRFlags(PrimaryInc); + connectEpilog(TL, EpilStartIter, ModVal, EpilogPred, LoopReattach, NewExit, + LatchExit, Preheader, EpilogPreheader, VMap, DT, LI, SE, DL, + PreserveLCSSA); + } // If this loop is nested, then the loop stripminer changes the code in the // any of its parent loops, so the Scalar Evolution pass needs to be run @@ -1553,29 +1654,32 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // } // Record that the remainder loop was derived from a Tapir loop. - (*RemainderLoop)->setDerivedFromTapirLoop(); + if(!GPU) + (*RemainderLoop)->setDerivedFromTapirLoop(); // At this point, the code is well formed. We now simplify the new loops, // doing constant propagation and dead code elimination as we go. 
simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC); simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC); - simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT, - TTI, AC); + if(!GPU) + simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT, + TTI, AC); + // TODO: update all the analyses manually #ifndef NDEBUG - DT->verify(); - LI->verify(*DT); + //DT->verify(); + //LI->verify(*DT); #endif // Record that the old loop was derived from a Tapir loop. L->setDerivedFromTapirLoop(); // Update TaskInfo manually using the updated DT. - if (TI) + //if (TI) // FIXME: Recalculating TaskInfo for the whole function is wasteful. // Optimize this routine in the future. - TI->recalculate(*F, *DT); + //TI->recalculate(*F, *DT); // accumulate reductions in main loop const std::vector& blocks = L->getBlocks(); @@ -1595,6 +1699,22 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, } } + // To make the stripmining work for multiple backends, we parameterize on the step and the termination condition + // Roughly speaking, we want the CPU to look like + // n = p*s + k + // forall(i = 0; i(TripCount)){ - bloc = I->getNextNode(); + bloc = I->getParent()->getTerminator(); } else { bloc = F->getEntryBlock().getTerminator(); } IRBuilder<> RB(bloc); - Value *outerIters = RB.CreateUDiv(TripCount, + Value *outerIters; + if(!GPU) + outerIters = RB.CreateUDiv(TripCount, ConstantInt::get(TripCount->getType(), Count), "stripiter"); + else + outerIters = StepSize; + auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); for(auto &pair : reductions){ // TODO: generic allocation/free calls @@ -1738,18 +1863,25 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Loop* RL = LI->AllocateLoop(); if(ParentLoop) ParentLoop->addChildLoop(RL); else LI->addTopLevelLoop(RL); - RL->addBasicBlockToLoop(RedEpiHeader, *LI); - 
RL->addBasicBlockToLoop(body, *LI); + if(!ParentLoop){ + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); + } else { + LI->changeLoopFor(RedEpiHeader, RL); + RL->addBlockEntry(RedEpiHeader); + LI->changeLoopFor(body, RL); + RL->addBlockEntry(body); + } } LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); // TODO: fix DT updates - DT->recalculate(*F); + //DT->recalculate(*F); #ifndef NDEBUG - DT->verify(); - LI->verify(*DT); + //DT->verify(); + //LI->verify(*DT); #endif return NewLoop; diff --git a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp index 9866dcd1fb9ff..435ade12f55c2 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp @@ -212,12 +212,14 @@ static bool tryToStripMineLoop( // If the loop contains potentially expensive function calls, then we don't // want to stripmine it. + /* if (NumCalls > 0 && !ExplicitCount && !StripMiningRequested) { LLVM_DEBUG(dbgs() << " Skipping loop with expensive function calls.\n"); ORE.emit(createMissedAnalysis("ExpensiveCalls", L) << "Not stripmining loop with potentially expensive calls."); return false; } + */ // Make sure the count is a power of 2. if (!isPowerOf2_32(SMP.Count)) @@ -282,12 +284,25 @@ static bool tryToStripMineLoop( // Save loop properties before it is transformed. 
MDNode *OrigLoopID = L->getLoopID(); + // TODO: change this to check tapir loop attributes for custom target + bool GPU = false; + auto target = TLI->getTapirTarget(); + switch(target){ + case TapirTargetID::GPU: + case TapirTargetID::Cuda: + case TapirTargetID::Hip: + GPU = true; + break; + default: + break; + } + // Stripmine the loop Loop *RemainderLoop = nullptr; Loop *NewLoop = StripMineLoop(L, SMP.Count, SMP.AllowExpensiveTripCount, SMP.UnrollRemainder, LI, &SE, &DT, TTI, &AC, TI, &ORE, PreserveLCSSA, ParallelEpilog, - NeedNestedSync, &RemainderLoop); + NeedNestedSync, &RemainderLoop, GPU); if (!NewLoop) return false; @@ -372,7 +387,7 @@ PreservedAnalyses LoopStripMinePass::run(Function &F, // The parent must not be damaged by stripmining! #ifndef NDEBUG if (LoopChanged && ParentL) - ParentL->verifyLoop(); + //ParentL->verifyLoop(); #endif // Clear any cached analysis results for L if we removed it completely. @@ -383,5 +398,6 @@ PreservedAnalyses LoopStripMinePass::run(Function &F, if (!Changed) return PreservedAnalyses::all(); - return getLoopPassPreservedAnalyses(); + // If we've changed, assume we've not preserved anything + return PreservedAnalyses::none(); } diff --git a/llvm/lib/Transforms/Tapir/LoweringUtils.cpp b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp index e90970df06f0f..fd17fbce9d882 100644 --- a/llvm/lib/Transforms/Tapir/LoweringUtils.cpp +++ b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp @@ -25,6 +25,10 @@ #include "llvm/Support/ToolOutputFile.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Tapir/QthreadsABI.h" +#include "llvm/Transforms/Tapir/RealmABI.h" +#include "llvm/Transforms/Tapir/GPUABI.h" +#include "llvm/Transforms/Tapir/SerialABI.h" #include "llvm/Transforms/Tapir/TapirLoopInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" From 36be3a50d68ecd6c6b96ab13014701b2959b3495 Mon Sep 17 00:00:00 2001 From: George Stelle 
Date: Thu, 22 May 2025 11:45:25 -0600 Subject: [PATCH 15/23] gpuGridSize takes iteration count --- llvm/lib/Transforms/Tapir/GPUABI.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index 74106e56f1862..007a5d15a973c 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -41,7 +41,7 @@ Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { IRBuilder<> BH(GrainsizeCall); auto *M = GrainsizeCall->getModule(); Type *LLVMInt64Ty = Type::getInt64Ty(M->getContext()); - Value *GS = BH.CreateCall(M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty)); + Value *GS = BH.CreateCall(M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty, LLVMInt64Ty), {GrainsizeCall->getArgOperand(0)}); //FunctionCallee GGS = M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty); // Replace uses of grainsize intrinsic call with this grainsize value. From 957be4caf48cb0c755d2bf81afe191110eabbe10 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Fri, 18 Jul 2025 12:49:35 -0600 Subject: [PATCH 16/23] Fix to ensure clang generates loop invariant bound for RHS of comparison --- clang/lib/CodeGen/CGKitsune.cpp | 60 ++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGKitsune.cpp b/clang/lib/CodeGen/CGKitsune.cpp index bc5d9f4ea7f92..1222e067d222d 100644 --- a/clang/lib/CodeGen/CGKitsune.cpp +++ b/clang/lib/CodeGen/CGKitsune.cpp @@ -59,6 +59,7 @@ #include "clang/CodeGen/CGFunctionInfo.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/IR/ValueMap.h" +#include "llvm/IR/FixedPointBuilder.h" using namespace clang; using namespace CodeGen; @@ -328,6 +329,11 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S, // Evaluate the initialization before the loop. 
EmitStmt(S.getInit()); + // We assume that the boolean is a binary operator and pre-compute RHS + auto* BO = dyn_cast(S.getCond()); + llvm::Value *RHS = EmitScalarExpr(BO->getRHS()); + + // In a parallel loop there will always be a condition block // so there is no need to test JumpDest Condition = getJumpDestInCurrentScope("forall.cond"); @@ -366,7 +372,59 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S, // C99 6.8.5p2/p4: The first substatement is executed if the expression // compares unequal to 0. The condition must be a scalar type. - llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond()); + + //llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond()); + llvm::Value *LHS = EmitScalarExpr(BO->getLHS()); + QualType LHSTy = BO->getLHS()->getType(); + llvm::Value *BoolCondVal; + if (LHSTy->hasSignedIntegerRepresentation()) { + switch(BO->getOpcode()) { + case BO_GT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGT, LHS, RHS, "cmp"); + break; + case BO_GE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGE, LHS, RHS, "cmp"); + break; + case BO_LT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, LHS, RHS, "cmp"); + break; + case BO_LE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLE, LHS, RHS, "cmp"); + break; + case BO_NE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp"); + break; + case BO_EQ: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp"); + break; + default: + llvm_unreachable("Invalid comparison in forall"); + } + } else { + switch(BO->getOpcode()){ + case BO_GT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGT, LHS, RHS, "cmp"); + break; + case BO_GE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGE, LHS, RHS, "cmp"); + break; + case BO_LT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_ULT, LHS, RHS, "cmp"); + break; + case BO_LE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_ULE, LHS, RHS, "cmp"); + 
break; + case BO_NE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp"); + break; + case BO_EQ: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp"); + break; + default: + llvm_unreachable("Invalid comparison in forall"); + } + } + Builder.CreateCondBr( BoolCondVal, Detach, Sync.getBlock(), createProfileWeightsForLoop(S.getCond(), getProfileCount(S.getBody()))); From 3f4d462f8fc0892e9d4c0b1d9865477021be5b0d Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 2 Oct 2025 07:40:03 -0600 Subject: [PATCH 17/23] CUDA 13 fixes --- kitsune/runtime/cuda/dylib_support.cpp | 2 +- kitsune/runtime/cuda/kitcuda.cpp | 4 ++++ kitsune/runtime/cuda/kitcuda.h | 1 + kitsune/runtime/cuda/kitcuda_dylib.h | 2 +- kitsune/runtime/cuda/memory.cpp | 15 +++++++++------ 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/kitsune/runtime/cuda/dylib_support.cpp b/kitsune/runtime/cuda/dylib_support.cpp index c518e72e712a9..907d4f4ca3808 100644 --- a/kitsune/runtime/cuda/dylib_support.cpp +++ b/kitsune/runtime/cuda/dylib_support.cpp @@ -114,7 +114,7 @@ bool __kitcuda_load_symbols() { DLSYM_LOAD(cuFuncGetAttribute); /* Context management */ - DLSYM_LOAD(cuCtxCreate_v3); + DLSYM_LOAD(cuCtxCreate_v4); DLSYM_LOAD(cuDevicePrimaryCtxRetain); DLSYM_LOAD(cuCtxGetCurrent); DLSYM_LOAD(cuCtxSetCurrent); diff --git a/kitsune/runtime/cuda/kitcuda.cpp b/kitsune/runtime/cuda/kitcuda.cpp index e26aafe2f5b24..76fc1c586228c 100644 --- a/kitsune/runtime/cuda/kitcuda.cpp +++ b/kitsune/runtime/cuda/kitcuda.cpp @@ -72,6 +72,7 @@ bool _kitcuda_initialized = false; int _kitcuda_device_id = -1; CUdevice _kitcuda_device = -1; +CUmemLocation _kitcuda_mem_location; CUcontext _kitcuda_context; // TODO: We currently don't use these values within the runtime but @@ -141,6 +142,9 @@ bool __kitcuda_initialize() { if (!__kitrt_get_env_value("KITCUDA_DEVICE_ID", _kitcuda_device_id)) _kitcuda_device_id = 0; + _kitcuda_mem_location.type = CU_MEM_LOCATION_TYPE_DEVICE; + 
_kitcuda_mem_location.id = _kitcuda_device_id; + assert(_kitcuda_device_id < device_count && "kitcuda: KITCUDA_DEVICE_ID value exceeds available number" " of devices."); diff --git a/kitsune/runtime/cuda/kitcuda.h b/kitsune/runtime/cuda/kitcuda.h index 20de2485db0c9..7ae93d4115d03 100644 --- a/kitsune/runtime/cuda/kitcuda.h +++ b/kitsune/runtime/cuda/kitcuda.h @@ -441,6 +441,7 @@ inline CUcontext __kitcuda_get_context() { #endif extern CUdevice _kitcuda_device; +extern CUmemLocation _kitcuda_mem_location; extern CUcontext _kitcuda_context; #define CU_SAFE_CALL(x) \ diff --git a/kitsune/runtime/cuda/kitcuda_dylib.h b/kitsune/runtime/cuda/kitcuda_dylib.h index 8e89edc835549..3ed7932e84bde 100644 --- a/kitsune/runtime/cuda/kitcuda_dylib.h +++ b/kitsune/runtime/cuda/kitcuda_dylib.h @@ -82,7 +82,7 @@ DECLARE_DLSYM(cuFuncGetName); DECLARE_DLSYM(cuFuncGetAttribute); /* Context management */ -DECLARE_DLSYM(cuCtxCreate_v3); +DECLARE_DLSYM(cuCtxCreate_v4); DECLARE_DLSYM(cuDevicePrimaryCtxRetain); DECLARE_DLSYM(cuCtxGetCurrent); DECLARE_DLSYM(cuCtxSetCurrent); diff --git a/kitsune/runtime/cuda/memory.cpp b/kitsune/runtime/cuda/memory.cpp index de1c1d24bee3d..5b8d87f490822 100644 --- a/kitsune/runtime/cuda/memory.cpp +++ b/kitsune/runtime/cuda/memory.cpp @@ -79,9 +79,9 @@ __kitcuda_mem_alloc_managed(size_t size) { // to occur on first touch -- thus our 'prefetch' status here is a bit // misleading (technically we are not prefetched to either host nor device). CU_SAFE_CALL(cuMemAdvise_p(devp, size, CU_MEM_ADVISE_SET_ACCESSED_BY, - _kitcuda_device)); + _kitcuda_mem_location)); CU_SAFE_CALL(cuMemAdvise_p(devp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - _kitcuda_device)); + _kitcuda_mem_location)); int enable = 1; CU_SAFE_CALL( @@ -267,7 +267,7 @@ void *__kitcuda_mem_gpu_prefetch(void *vp, void *opaque_stream) { // CU_MEM_ADVISE_SET_READ_MOSTLY advice flag. 
CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - _kitcuda_device)); + _kitcuda_mem_location)); // Issue a prefetch request on the provided stream. If the given // stream is null, create a new stream and return it. Once issued @@ -280,7 +280,7 @@ void *__kitcuda_mem_gpu_prefetch(void *vp, void *opaque_stream) { else cu_stream = (CUstream)__kitcuda_get_thread_stream(); - CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitcuda_device, + CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitcuda_mem_location, 0, cu_stream)); _kitcuda_mem_alloc_mutex.lock(); __kitrt_mark_mem_prefetched(vp); @@ -322,9 +322,12 @@ void *__kitcuda_mem_host_prefetch(void *vp, void *opaque_stream) { // // TODO: A lot of work needs to go into seeing if we can be // smarter about device- and host-side prefetching. + CUmemLocation cpu; + cpu.type = CU_MEM_LOCATION_TYPE_HOST; + cpu.id = 0; CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - CU_DEVICE_CPU)); + cpu)); // Issue a prefetch request on the stream associated with the // calling thread. Once issued go ahead and mark the memory as // no long being prefetched to the device/GPU. 
This "mark" does @@ -336,7 +339,7 @@ void *__kitcuda_mem_host_prefetch(void *vp, void *opaque_stream) { else cu_stream = (CUstream)__kitcuda_get_thread_stream(); - CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, CU_DEVICE_CPU, + CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, cpu, 0, cu_stream)); __kitrt_set_mem_prefetch(vp, false); return cu_stream; From 9fe76e2d6fb6e92f019c3f94dbb79902fe1f187e Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 18 Dec 2025 09:13:18 -0700 Subject: [PATCH 18/23] Added reduction keyword header --- clang/lib/Headers/kitsune.h.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake index d8e6cd4bf86ee..a82691697c135 100644 --- a/clang/lib/Headers/kitsune.h.cmake +++ b/clang/lib/Headers/kitsune.h.cmake @@ -23,11 +23,16 @@ #include "kitsune_rt.h" #if defined(reduction) -#warning found reduction definition: try puttin kitsune lower in include order +#warning found reduction definition: try putting kitsune lower in include order #else #define reduction __attribute__((noinline, kitsune_reduction)) #endif +#if defined(reduce) +#warning found reduce definition: try putting kitsune lower in include order +#else +#define reduce __kitsune_reduce(var, join, unit) +#endif #if defined(KITSUNE_ENABLE_OPENCL_ABI_TARGET) #define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)a, n) From 3ff073e090cda353b4b64d389640ceb3aeb070e3 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 18 Dec 2025 09:17:43 -0700 Subject: [PATCH 19/23] Working simple reductions --- kitsune-tests/reductions/l2.c | 8 +- kitsune-tests/reductions/l2_cond.c | 49 ++++++++++ kitsune-tests/reductions/l2_openmp.c | 10 +-- kitsune-tests/reductions/makefile | 31 ++++--- kitsune-tests/reductions/triangle.c | 49 ++++++++++ kitsune/include/kitsune/kitsune.h | 3 +- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 42 +++++---- .../Transforms/Tapir/LoopStripMinePass.cpp | 90 
+++++++++++++++++-- 8 files changed, 234 insertions(+), 48 deletions(-) create mode 100644 kitsune-tests/reductions/l2_cond.c create mode 100644 kitsune-tests/reductions/triangle.c diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index 3e821a6d02c74..d4c78453803c5 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -5,17 +5,19 @@ #include #include #include +#include #include reduction -void sum(double *a, double b){ - *a += b; +void sum(double *a, double b, double unit){ + *a += b + unit; } double l2(uint64_t n, double* a){ double red = 0; forall(uint64_t i=0; i +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(double *a, double b, double unit){ + *a += b; +} + +double l2(uint64_t n, double* a){ + double red = 0; + forall(uint64_t i=0; i n/10) + sum(&red, a[i]*a[i], 0); + } + + return sqrt(red); +} + +int main(int argc, char** argv){ + int e = argc > 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< 1 ? atoi(argv[1]) : 28; int niter = argc > 2 ? 
atoi(argv[2]) : 100; diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile index 3d45d5585723d..cc53950dcc719 100644 --- a/kitsune-tests/reductions/makefile +++ b/kitsune-tests/reductions/makefile @@ -1,21 +1,30 @@ -exes=l2_serial l2_opencilk l2_gpu l2_serial l2_openmp l2_cuda +exes=l2_opencilk l2_gpu triangle_opencilk triangle_gpu triangle_serial all: $(exes) -l2_serial: l2.c - clang $< -o $@ -O2 -fuse-ld=lld -fPIC -o l2_serial -ftapir=serial -lm -lllvm-gpu -lomp +%_serial: %.c + clang $< -o $@ -O2 -fuse-ld=lld -fPIC -ftapir=serial -lm -lllvm-gpu -lomp #-mllvm -debug-only="loop-stripmine" -l2_opencilk: l2.c - clang $< -o $@ -O2 -fuse-ld=lld -fPIC -o l2_opencilk -ftapir=opencilk -lm -lllvm-gpu -lomp -mllvm -debug-only="loop-stripmine" +%_opencilk: %.c + clang $< -o $@ -O2 -fuse-ld=lld -fPIC -ftapir=opencilk -lm -lllvm-gpu -lomp #-mllvm -debug-only="loop-stripmine" -l2_gpu: l2.c - clang $< -o $@ -O2 -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu -lomp -mllvm -debug-only="loop-stripmine" +%_opencilk.ll: %.c + clang $< -o $@ -O2 -fPIC -ftapir=opencilk -mllvm -debug-only="loop-stripmine" -S -emit-llvm -l2_cuda: l2.c - kitcc $< -o $@ -O2 -fPIC -o l2_cuda -ftapir=cuda -lm -lllvm-gpu -lomp -v -mllvm -cuabi-arch=sm_75 +%_gpu: %.c + clang $< -o $@ -O2 -fPIC -ftapir=gpu -lm -lllvm-gpu -lomp #-mllvm -debug-only="loop-stripmine" -l2_openmp: l2_openmp.c - clang l2_openmp.c -fopenmp -O2 -o l2_openmp -lm -lgomp +%_gpu.ll: %.c + clang $< -o $@ -O2 -fPIC -ftapir=gpu -mllvm -debug-only="loop-stripmine" -S -emit-llvm + +%_cuda: %.c + kitcc $< -o $@ -O2 -fPIC -ftapir=cuda -lm -lllvm-gpu -lomp -v -mllvm -cuabi-arch=sm_75 + +%_stripmined.ll: %.c + clang $< -o $@ -O1 -ftapir=none -S -emit-llvm -mllvm -stripmine-loop + +%_openmp: %_openmp.c + clang $< -fopenmp -O2 -o $@ -lm clean: rm -f $(exes) l2_stripmined.ll l2 l2.ll diff --git a/kitsune-tests/reductions/triangle.c b/kitsune-tests/reductions/triangle.c new file mode 100644 index 0000000000000..720defc5a9898 --- 
/dev/null +++ b/kitsune-tests/reductions/triangle.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(int *a, int b, int unit){ + *a += b + unit; +} + +int triangle(uint64_t n, int* a){ + int red = 0; + forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 100; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = e;//1ULL<> redMap; + std::vector> redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. // TODO: Initialize local reductions with unit values @@ -1742,6 +1742,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // TODO: generic allocation/free calls auto ci = pair.first; auto ptr = ci->getArgOperand(0); + auto unit = ci->getArgOperand(2); auto ty = pair.second; auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty))); @@ -1752,7 +1753,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, auto lptr = BH.CreateBitCast( BH.CreateGEP(ty, al, NewIdx), ptr->getType()); - redMap.push_back(std::make_tuple(ci, ptr, al, ty)); + redMap.push_back(std::make_tuple(ci, ptr, al, ty, unit)); // Assume there is more than one element, and // use the first element for the first iteration of the loop. // roughly: @@ -1770,7 +1771,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // } // for( j ∈ j_k_m .. 
n ) // reduce(localred+m, body(j)); - // } + // // for(k ∈ 0..m) // reduce(&red, localred[k]); // @@ -1801,28 +1802,33 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if(!reductions.empty()){ // Peel the first iteration of the loop and replace the reduction calls in // the peeled code with stores + // Can't do this in general if the reduction is conditional ValueToValueMapTy VMap; - peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); - SmallVector cis; + //peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); + SmallVector CIS; for(auto &BB : NewLoop->blocks()){ - if(!L->contains(BB)){ // better way? - for(auto &I : *BB){ - if(auto *CI = dyn_cast(&I)){ - auto *f = CI->getCalledFunction(); - if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ - IRBuilder<> pb(&I); - pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); - cis.push_back(&I); - f->removeFnAttr(Attribute::NoInline); - } + // We find the location that we reduce into and create a store of unit + // TODO: Get unit value for reduction + for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *F = CI->getCalledFunction(); + if(F->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + // this must be defined in the outer parallel loop but before the inner loop + IRBuilder<> PB(dyn_cast(CI->getArgOperand(0))->getNextNode()); + PB.CreateStore(CI->getArgOperand(2), CI->getArgOperand(0)); + CIS.push_back(&I); + F->removeFnAttr(Attribute::NoInline); + F->removeFnAttr(Attribute::OptimizeNone); } } } } - for(auto &I : cis){ + /* + for(auto &I : CIS){ I->eraseFromParent(); } + */ Instruction* term = LatchExit->getTerminator(); BasicBlock *PostSync = term->getSuccessor(0); @@ -1843,13 +1849,13 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // For each reduction, get the allocated thread local reduced values and // reduce them. 
for(auto& kv : redMap){ - const auto [ ci, ptr, al, ty ] = kv; + const auto [ ci, ptr, al, ty, unit ] = kv; auto lptr = BB.CreateBitCast( BB.CreateGEP(ty, al, Idx), ptr->getType()); auto x = BB.CreateLoad(ty, lptr); BB.SetCurrentDebugLocation(ci->getDebugLoc()); - BB.CreateCall(ci->getCalledFunction(), { ptr, x }); + BB.CreateCall(ci->getCalledFunction(), { ptr, x , unit}); } Value *IdxAdd = BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), diff --git a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp index 435ade12f55c2..e4947490c7e7a 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp @@ -152,12 +152,12 @@ static bool tryToStripMineLoop( // If the loop size is unknown, then we cannot compute a stripmining count for // it. - if (!ExplicitCount && UnknownSize) { - LLVM_DEBUG(dbgs() << " Not stripmining loop with unknown size.\n"); - ORE.emit(createMissedAnalysis("UnknownSize", L) - << "Cannot stripmine loop with unknown size."); - return false; - } + //if (!ExplicitCount && UnknownSize) { + // LLVM_DEBUG(dbgs() << " Not stripmining loop with unknown size.\n"); + // ORE.emit(createMissedAnalysis("UnknownSize", L) + // << "Cannot stripmine loop with unknown size."); + // return false; + // } // If the loop size is enormous, then we might want to use a stripmining count // of 1 for it. 
@@ -288,6 +288,9 @@ static bool tryToStripMineLoop( bool GPU = false; auto target = TLI->getTapirTarget(); switch(target){ + // We don't want to stripmine for serial targets + case TapirTargetID::Serial: + return false; case TapirTargetID::GPU: case TapirTargetID::Cuda: case TapirTargetID::Hip: @@ -322,6 +325,81 @@ static bool tryToStripMineLoop( return true; } +namespace { + +class LoopStripMine : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + + std::optional ProvidedCount; + + LoopStripMine(std::optional Count = std::nullopt) + : LoopPass(ID), ProvidedCount(Count) { + initializeLoopStripMinePass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + + Function &F = *L->getHeader()->getParent(); + + auto &TLI = getAnalysis().getTLI(F); + auto &DT = getAnalysis().getDomTree(); + LoopInfo *LI = &getAnalysis().getLoopInfo(); + TaskInfo *TI = &getAnalysis().getTaskInfo(); + ScalarEvolution &SE = getAnalysis().getSE(); + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); + auto &AC = getAnalysis().getAssumptionCache(F); + // For the old PM, we can't use OptimizationRemarkEmitter as an analysis + // pass. Function analyses need to be preserved across loop transformations + // but ORE cannot be preserved (see comment before the pass definition). + OptimizationRemarkEmitter ORE(&F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + + bool ret = tryToStripMineLoop(L, DT, LI, SE, TTI, AC, TI, ORE, &TLI, + PreserveLCSSA, ProvidedCount); + if(!ret){ + ORE.emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not stripmined"); + } + return ret; + } + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... 
+ void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + getLoopAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char LoopStripMine::ID = 0; + +INITIALIZE_PASS_BEGIN(LoopStripMine, "loop-stripmine", "Stripmine Tapir loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(LoopStripMine, "loop-stripmine", "Stripmine Tapir loops", + false, false) + +Pass *llvm::createLoopStripMinePass(int Count) { + // TODO: It would make more sense for this function to take the optionals + // directly, but that's dangerous since it would silently break out of tree + // callers. + return new LoopStripMine(Count == -1 ? std::nullopt + : std::optional(Count)); +} + PreservedAnalyses LoopStripMinePass::run(Function &F, FunctionAnalysisManager &AM) { Module& M = *F.getParent(); From 5e7420f2b8fea2e8b0cd4582428449b8fe38642d Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 18 Dec 2025 09:18:56 -0700 Subject: [PATCH 20/23] GPU ABI fixes --- llvm/lib/Transforms/Tapir/GPUABI.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index 007a5d15a973c..ec5ccd68cdce9 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -41,13 +41,17 @@ Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { IRBuilder<> BH(GrainsizeCall); auto *M = GrainsizeCall->getModule(); Type *LLVMInt64Ty = Type::getInt64Ty(M->getContext()); - Value *GS = BH.CreateCall(M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty, LLVMInt64Ty), {GrainsizeCall->getArgOperand(0)}); + Type *LLVMInt32Ty = Type::getInt64Ty(M->getContext()); + // we have to cast + Value *GSO = 
BH.CreateIntCast(GrainsizeCall->getOperand(0), LLVMInt64Ty, false); + Value *GS = BH.CreateCall(M->getOrInsertFunction("gpuGridSize", LLVMInt32Ty, LLVMInt64Ty), {GSO}); + Value *GSN = BH.CreateIntCast(GS, GrainsizeCall->getType(), false); //FunctionCallee GGS = M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty); // Replace uses of grainsize intrinsic call with this grainsize value. //GrainsizeCall->setCalledFunction(GGS); - GrainsizeCall->replaceAllUsesWith(GS); - return GS; + GrainsizeCall->replaceAllUsesWith(GSN); + return GSN; } void GPUABI::lowerSync(SyncInst &SI) { @@ -398,7 +402,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); - Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount }); + Value *TripCount64 = B.CreateIntCast(TripCount, Int64Ty, true); + Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount64 }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); From d5db34cc149a7e37604b13b99137aa563b759784 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 18 Dec 2025 09:19:41 -0700 Subject: [PATCH 21/23] Make stripmining not dependent on O2 or O3 --- llvm/lib/Passes/PassBuilderPipelines.cpp | 56 ++++++++++++------------ 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 57c38dcd5aa1e..fc623989e2c29 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1597,34 +1597,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // rather than on each loop in an inside-out manner, and so they are actually // function passes. - // Stripmine Tapir loops, if pass is enabled. 
- if (PTO.LoopStripmine && Level.getSpeedupLevel() > 1 && - !Level.isOptimizingForSize()) { - LoopPassManager LPM1, LPM2; - LPM1.addPass( - IndVarSimplifyPass(/*WidenIndVars=*/true, /*TapirLoopsOnly=*/true)); - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(std::move(LPM1), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); - OptimizePM.addPass(LoopStripMinePass()); - // Cleanup tasks after stripmining loops. - OptimizePM.addPass(TaskSimplifyPass()); - // Cleanup after stripmining loops. - LPM2.addPass(LoopSimplifyCFGPass()); - LPM2.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, - /*AllowSpeculation=*/true)); - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(std::move(LPM2), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); - // Don't run IndVarSimplify at this point, as it can actually inhibit - // vectorization in some cases. - OptimizePM.addPass(JumpThreadingPass()); - OptimizePM.addPass(CorrelatedValuePropagationPass()); - OptimizePM.addPass(InstCombinePass()); - } - invokeVectorizerStartEPCallbacks(OptimizePM, Level); LoopPassManager LPM; @@ -1760,8 +1732,34 @@ PassBuilder::buildTapirLoopLoweringPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + // Stripmine Tapir loops, if pass is enabled. + if (PTO.LoopStripmine) { + LoopPassManager LPM1, LPM2; + LPM1.addPass(TapirIndVarSimplifyPass()); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM1), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + FPM.addPass(LoopStripMinePass()); + // Cleanup tasks after stripmining loops. + FPM.addPass(TaskSimplifyPass()); + // Cleanup after stripmining loops. 
+ LPM2.addPass(LoopSimplifyCFGPass()); + LPM2.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM2), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + // Don't run IndVarSimplify at this point, as it can actually inhibit + // vectorization in some cases. + FPM.addPass(JumpThreadingPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(InstCombinePass()); + } + + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); // Outline Tapir loops as needed. MPM.addPass(LoopSpawningPass(Level)); if (VerifyTapirLowering) From 316b838d25487000fde03b7cb157ab200aca798f Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 18 Dec 2025 09:20:33 -0700 Subject: [PATCH 22/23] Make dead argument elimination disabled by optnone --- llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d32b829e2ad79..bede4ec6c26e7 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -284,6 +284,10 @@ bool DeadArgumentEliminationPass::removeDeadArgumentsFromCallers(Function &F) { if (F.hasFnAttribute(Attribute::Naked)) return false; + // Don't operate on optnone + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + if (F.use_empty()) return false; @@ -730,6 +734,10 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { if (FrozenFunctions.count(F)) return false; + // Don't operate on optnone + if (F->hasFnAttribute(Attribute::OptimizeNone)) + return false; + // Start by computing a new prototype for the function, which is the same as // the old function, but has fewer arguments and a different return type. 
FunctionType *FTy = F->getFunctionType(); From 7113766da4a89fe405be2bae5c1a7d969a3c57c2 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 8 Jan 2026 16:58:44 -0700 Subject: [PATCH 23/23] 21.x rebase fixes --- clang/include/clang/Basic/Attr.td | 1 + clang/lib/CodeGen/CGKitsune.cpp | 1 + clang/lib/Driver/Driver.cpp | 3 + clang/lib/Driver/KitsuneOptionUtils.cpp | 3 + clang/lib/Driver/Tapir.cpp | 84 ---------- clang/lib/Driver/ToolChain.cpp | 49 +----- clang/lib/Frontend/CompilerInvocation.cpp | 1 + clang/lib/Headers/kitsune.h.cmake | 150 ------------------ clang/lib/Headers/magma.h | 43 ----- clang/lib/Headers/reductions.h | 56 ------- kitsune-tests/reductions/l2.c | 3 +- kitsune-tests/reductions/makefile | 16 +- kitsune/docs/limitations.md | 11 ++ kitsune/docs/using.md | 30 ++-- kitsune/include/kitsune/Config/config.h.cmake | 3 + kitsune/include/kitsune/Core/Tapir.h | 3 +- kitsune/include/kitsune/Core/TapirTargets.h | 6 +- kitsune/lib/Analysis/TapirTargetAnalysis.cpp | 5 + kitsune/lib/Core/TapirTargetOptions.cpp | 2 +- kitsune/lib/Support/Deserialize.cpp | 4 +- kitsune/lib/Support/ToString.cpp | 2 + llvm/CMakeLists.txt | 4 +- llvm/include/llvm/IR/Attributes.td | 6 +- llvm/include/llvm/Transforms/Tapir/GPUABI.h | 5 +- .../llvm/Transforms/Tapir/LoopStripMinePass.h | 2 +- .../llvm/Transforms/Tapir/TapirTargetIDs.h | 115 -------------- llvm/lib/Passes/PassBuilderPipelines.cpp | 12 +- llvm/lib/Passes/PassRegistry.def | 2 +- .../IPO/DeadArgumentElimination.cpp | 4 +- llvm/lib/Transforms/Tapir/GPUABI.cpp | 7 +- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 118 +++++++++----- .../Transforms/Tapir/LoopStripMinePass.cpp | 122 ++++---------- .../Tapir/jump-threading-detach-continue-2.ll | 2 +- .../loop-stripmine-zero-iter-pfor-loop.ll | 2 +- .../Transforms/Tapir/missed-loop-opts-test.ll | 2 +- 35 files changed, 198 insertions(+), 681 deletions(-) delete mode 100644 clang/lib/Driver/Tapir.cpp delete mode 100644 clang/lib/Headers/kitsune.h.cmake delete mode 100644 
clang/lib/Headers/magma.h delete mode 100644 clang/lib/Headers/reductions.h delete mode 100644 llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index f4f7d7f6e666f..50063de0d4067 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -5227,6 +5227,7 @@ def NonString : InheritableAttr { let Spellings = [GCC<"nonstring">]; let Subjects = SubjectList<[Var, Field]>; let Documentation = [NonStringDocs]; +} // +===== kitsune-/tapir-centric attributes diff --git a/clang/lib/CodeGen/CGKitsune.cpp b/clang/lib/CodeGen/CGKitsune.cpp index 1222e067d222d..d026d74f0e7bb 100644 --- a/clang/lib/CodeGen/CGKitsune.cpp +++ b/clang/lib/CodeGen/CGKitsune.cpp @@ -329,6 +329,7 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S, // Evaluate the initialization before the loop. EmitStmt(S.getInit()); + // TODO: explain more // We assume that the boolean is a binary operator and pre-compute RHS auto* BO = dyn_cast(S.getCond()); llvm::Value *RHS = EmitScalarExpr(BO->getRHS()); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index d861e8474050d..9897492f05cee 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -161,6 +161,9 @@ static void CheckTTEnabled(const Driver &D, llvm::TTID TT) { if (!KITSUNE_REALM_ENABLED) D.Diag(diag::err_drv_kitsune_target_not_enabled) << llvm::toString(TT); return; + case llvm::TTID::GPU: + // TODO: Check this + return; case llvm::TTID::Serial: // The serial tapir target is always enabled return; diff --git a/clang/lib/Driver/KitsuneOptionUtils.cpp b/clang/lib/Driver/KitsuneOptionUtils.cpp index 56695829646b3..00a2de5e70298 100644 --- a/clang/lib/Driver/KitsuneOptionUtils.cpp +++ b/clang/lib/Driver/KitsuneOptionUtils.cpp @@ -112,6 +112,8 @@ clang::getTapirTargetConfigFileName(const opt::ArgList &args) { return "qthreads.cfg"; case TTID::Realm: return "realm.cfg"; + case TTID::GPU: + return 
"gpu.cfg"; } llvm_unreachable("getTapirTargetConfigFile: TTID not handled"); } @@ -323,6 +325,7 @@ static bool parseKitsuneTTArgs(KitsuneOptions &kitOpts, TTID tt, case llvm::TTID::Realm: return parseKitsuneRealmArgs(kitOpts, args, optTable, diags); case llvm::TTID::Serial: + case llvm::TTID::GPU: // TODO:? return true; } llvm_unreachable("ParseKitsuneTTArgs: TTID not handled"); diff --git a/clang/lib/Driver/Tapir.cpp b/clang/lib/Driver/Tapir.cpp deleted file mode 100644 index 0c9826526fcb1..0000000000000 --- a/clang/lib/Driver/Tapir.cpp +++ /dev/null @@ -1,84 +0,0 @@ -//===--- Tapir.cpp - C Language Family Language Options ---------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the functions from Tapir.h -// -//===----------------------------------------------------------------------===// - -#include "clang/Driver/Tapir.h" -#include "clang/Driver/Options.h" -#include "clang/Driver/ToolChain.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Option/Arg.h" -#include "llvm/Option/ArgList.h" -#include "llvm/Transforms/Tapir/TapirTargetIDs.h" - -using namespace clang::driver; -using namespace llvm; - -std::optional clang::parseTapirTarget(const opt::ArgList &Args) { - if (const opt::Arg *A = Args.getLastArg(options::OPT_ftapir_EQ)) - return llvm::StringSwitch>(A->getValue()) - .Case("none", TapirTargetID::None) - .Case("serial", TapirTargetID::Serial) - .Case("cuda", TapirTargetID::Cuda) - .Case("hip", TapirTargetID::Hip) - .Case("gpu", TapirTargetID::GPU) - .Case("opencilk", TapirTargetID::OpenCilk) - .Case("openmp", TapirTargetID::OpenMP) - .Case("qthreads", TapirTargetID::Qthreads) - .Case("realm", TapirTargetID::Realm) - .Default(std::nullopt); - return std::nullopt; -} - 
-std::optional -clang::parseTapirNVArchTarget(const opt::ArgList &Args) { - if (const opt::Arg *A = Args.getLastArg(options::OPT_ftapir_nvarch_EQ)) - return llvm::StringSwitch(A->getValue()) - .Case("sm_50", TapirNVArchTargetID::SM_50) - .Case("sm_52", TapirNVArchTargetID::SM_52) - .Case("sm_53", TapirNVArchTargetID::SM_53) - .Case("sm_60", TapirNVArchTargetID::SM_60) - .Case("sm_62", TapirNVArchTargetID::SM_62) - .Case("sm_70", TapirNVArchTargetID::SM_70) - .Case("sm_75", TapirNVArchTargetID::SM_75) - .Case("sm_80", TapirNVArchTargetID::SM_80) - .Case("sm_86", TapirNVArchTargetID::SM_86) - .Case("sm_90", TapirNVArchTargetID::SM_90) - .Default(TapirNVArchTargetID::Last_TapirNVArchTargetID); - - return std::nullopt; -} - -std::optional -clang::getTargetConfigFileName(const opt::ArgList &Args) { - if (std::optional tt = parseTapirTarget(Args)) { - switch (*tt) { - case TapirTargetID::None: - return "none.cfg"; - case TapirTargetID::Serial: - return "serial.cfg"; - case TapirTargetID::Cuda: - return "cuda.cfg"; - case TapirTargetID::Hip: - return "hip.cfg"; - case TapirTargetID::OpenCilk: - return "opencilk.cfg"; - case TapirTargetID::OpenMP: - return "openmp.cfg"; - case TapirTargetID::Qthreads: - return "qthreads.cfg"; - case TapirTargetID::Realm: - return "realm.cfg"; - default: - return std::nullopt; - } - } - return std::nullopt; -} diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 670f43e8e9ff2..c2d263e58a3af 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -2272,13 +2272,12 @@ void ToolChain::AddKitsuneRealmCommonArgs(const ArgList &Args, void ToolChain::AddKitsunePreprocessorArgs(const ArgList &Args, ArgStringList &CmdArgs) const { -<<<<<<< HEAD auto AddTTArgs = [&](TTID TT, const ArgList &Args, ArgStringList &CmdArgs) -> void { switch (TT) { case TTID::Nolo: return; - case TTID:GPU: + case TTID::GPU: return; case TTID::Cuda: return ExtractArgsFromString(KITSUNE_CUDA_EXTRA_PREPROCESSOR_FLAGS, 
@@ -2344,7 +2343,6 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, case TTID::Hip: AddKitsuneHipCommonArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_HIP_EXTRA_COMPILER_FLAGS, CmdArgs, Args); -<<<<<<< HEAD return; case TTID::Lambda: AddKitsuneLambdaCommonArgs(Args, CmdArgs); @@ -2357,15 +2355,11 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, return; case TTID::OpenCilk: AddKitsuneOpenCilkCommonArgs(Args, CmdArgs); -======= - break; - case llvm::TapirTargetID::GPU: - break; - case llvm::TapirTargetID::OpenCilk: ->>>>>>> 82193e08056a (GPU reductions via stripmining pass working) ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_COMPILER_FLAGS, CmdArgs, Args); return; + case llvm::TTID::GPU: + return; case TTID::OpenMP: AddKitsuneOpenMPCommonArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_COMPILER_FLAGS, CmdArgs, Args); @@ -2561,7 +2555,6 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args, case TTID::Hip: AddKitsuneHipLinkerArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_HIP_EXTRA_LINKER_FLAGS, CmdArgs, Args); -<<<<<<< HEAD return; case TTID::Lambda: AddKitsuneLambdaLinkerArgs(Args, CmdArgs); @@ -2573,42 +2566,10 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args, return; case TTID::OpenCilk: AddKitsuneOpenCilkLinkerArgs(Args, CmdArgs); -======= - break; - case llvm::TapirTargetID::GPU: - break; - - case llvm::TapirTargetID::OpenCilk: { - bool StaticOpenCilk = Args.hasArg(options::OPT_static); - bool UseAsan = getSanitizerArgs(Args).needsAsanRt(); - - // Link the correct Cilk personality fn - if (getDriver().CCCIsCXX()) - CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( - Args, - UseAsan ? "opencilk-asan-personality-cpp" - : "opencilk-personality-cpp", - StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); - else - CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( - Args, - UseAsan ? "opencilk-asan-personality-c" : "opencilk-personality-c", - StaticOpenCilk ? 
ToolChain::FT_Static : ToolChain::FT_Shared))); - - // Link the opencilk runtime. We do this after linking the personality - // function, to ensure that symbols are resolved correctly when using - // static linking. - CmdArgs.push_back(Args.MakeArgString(getOpenCilkRT( - Args, UseAsan ? "opencilk-asan" : "opencilk", - StaticOpenCilk ? ToolChain::FT_Static : ToolChain::FT_Shared))); - - // Add to the executable's runpath the default directory containing - // OpenCilk runtime. - addOpenCilkRuntimeRunPath(*this, Args, CmdArgs, Triple); - ->>>>>>> 82193e08056a (GPU reductions via stripmining pass working) ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_LINKER_FLAGS, CmdArgs, Args); return; + case TTID::GPU: + return; case TTID::OpenMP: AddKitsuneOpenMPLinkerArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_LINKER_FLAGS, CmdArgs, Args); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index db8c73df8b1ce..9a47676b2fe95 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4813,6 +4813,7 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts, case llvm::TTID::Qthreads: case llvm::TTID::Realm: case llvm::TTID::Serial: + case llvm::TTID::GPU: return; } llvm_unreachable("GenerateKitsuneArgs: TTID not handled"); diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake deleted file mode 100644 index a82691697c135..0000000000000 --- a/clang/lib/Headers/kitsune.h.cmake +++ /dev/null @@ -1,150 +0,0 @@ - -/* - * Copyright (c) 2020 Triad National Security, LLC - * All rights reserved. - * - * This file is part of the kitsune/llvm project. It is released under - * the LLVM license. 
- */ -#ifndef __CLANG_KITSUNE_H__ -#define __CLANG_KITSUNE_H__ - -#include -#include - -#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET -#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET -#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET -#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET -#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET -#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET -#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET - -#include "kitsune_rt.h" - -#if defined(reduction) -#warning found reduction definition: try putting kitsune lower in include order -#else -#define reduction __attribute__((noinline, kitsune_reduction)) -#endif - -#if defined(reduce) -#warning found reduce definition: try putting kitsune lower in include order -#else -#define reduce __kitsune_reduce(var, join, unit) -#endif - -#if defined(KITSUNE_ENABLE_OPENCL_ABI_TARGET) -#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)a, n) -#ifdef __cplusplus -extern "C" { -#endif - void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n); -#ifdef __cplusplus -} -#endif -#endif - -#if defined(spawn) -#warning encountered multiple definitions of spawn! -#else -#define spawn _kitsune_spawn -#endif - -#if defined(sync) -#warning encountered multiple definitions of sync! -#else -#define sync _kitsune_sync -#endif - -#if defined(forall) -#warning encountered multiple definitions of forall! 
-#else -#define forall _kitsune_forall -#endif - - -#if defined(_tapir_cuda_target) - #ifdef __cplusplus - extern "C" __attribute__((malloc)) void* __kitrt_cuMemAllocManaged(size_t); - template - inline __attribute__((always_inline)) - T* alloc(size_t N) { - return (T*)__kitrt_cuMemAllocManaged(sizeof(T) * N); - } - - extern "C" void __kitrt_cuMemFree(void*); - template - void dealloc(T* array) { - __kitrt_cuMemFree((void*)array); - } - #else - void* __attribute__((malloc)) __kitrt_cuMemAllocManaged(size_t); - inline __attribute__((always_inline)) - void *alloc(size_t total_bytes) { - return __kitrt_cuMemAllocManaged(total_bytes); - } - - void __kitrt_cuMemFree(void*); - inline __attribute__((always_inline)) - void dealloc(void *array) { - __kitrt_cuMemFree(array); - } - #endif -#elif defined(_tapir_hip_target) - #ifdef __cplusplus - extern "C" __attribute__((malloc)) void* __kitrt_hipMemAllocManaged(size_t); - template - inline __attribute__((always_inline)) - T* alloc(size_t N) { - return (T*)__kitrt_hipMemAllocManaged(sizeof(T) * N); - } - - extern "C" void __kitrt_hipMemFree(void*); - template - void dealloc(T* array) { - __kitrt_hipMemFree((void*)array); - } - #else - void* __attribute__((malloc)) __kitrt_hipMemAllocManaged(size_t); - inline __attribute__((always_inline)) - void *alloc(size_t total_bytes) { - return __kitrt_hipMemAllocManaged(total_bytes); - } - - void __kitrt_hipMemFree(void*); - inline __attribute__((always_inline)) - void dealloc(void *array) { - __kitrt_hipMemFree(array); - } - #endif -#else - #ifdef __cplusplus - extern "C" __attribute__((malloc)) void* __kitrt_defaultMemAlloc(size_t); - template - inline __attribute__((always_inline)) - T* alloc(size_t N) { - return (T*)__kitrt_defaultMemAlloc(sizeof(T) * N); - } - - extern "C" void __kitrt_defaultMemFree(void*); - template - void dealloc(T* array) { - __kitrt_defaultMemFree(array); - } - #else - void* __attribute__((malloc)) __kitrt_defaultMemAlloc(size_t); - inline 
__attribute__((always_inline)) - void *alloc(size_t total_bytes) { - return __kitrt_defaultMemAlloc(total_bytes); - } - - void __kitrt_defaultMemFree(void*); - inline __attribute__((always_inline)) - void dealloc(void* array) { - __kitrt_defaultMemFree(array); - } - #endif // __cplusplus -#endif // cpu targets - -#endif diff --git a/clang/lib/Headers/magma.h b/clang/lib/Headers/magma.h deleted file mode 100644 index 43c4fb8f1075f..0000000000000 --- a/clang/lib/Headers/magma.h +++ /dev/null @@ -1,43 +0,0 @@ -#include -#include - -template -struct Magma { - virtual a op(a x, a y) = 0; -}; - -template -struct UnitalMagma : public Magma { - virtual a id() = 0; -}; - -// Example unital magmas -template -struct Sum : UnitalMagma{ - a op(a x, a y){ return x + y; } - a id(){ return 0; } // look into this more -}; - -template -struct Product : UnitalMagma { - a op(a x, a y){ return x * y; } - a id(){ return 1; } -}; - -struct StringApp : UnitalMagma { - std::string op(std::string x, std::string y){ return x.append(y); } - std::string id() { return ""; } -}; - -template -struct Max : UnitalMagma { - a op(a x, a y){ return x > y ? x : y; } - a id() { return std::numeric_limits::min(); } -}; - -template -struct Min : UnitalMagma { - a op(a x, a y){ return x < y ? 
x : y; } - a id() { return std::numeric_limits::max(); } -}; - diff --git a/clang/lib/Headers/reductions.h b/clang/lib/Headers/reductions.h deleted file mode 100644 index d9abf9cf276b5..0000000000000 --- a/clang/lib/Headers/reductions.h +++ /dev/null @@ -1,56 +0,0 @@ -#include"magma.h" -#include -#include -#include -//#include - -template -a reduce(um m, v& xs){ - auto acc = m.id(); - for(auto x : xs){ - acc = m.op(acc, x); - } - return acc; -} - -template -a parReduce(um m, v& xs, uint64_t nthreads){ - uint64_t linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - assert(linesize % sizeof(a) == 0); - uint64_t linenum = linesize / sizeof(a); - a* accs = new a[nthreads * linenum]; - uint64_t size = xs.end() - xs.begin(); - assert(size % nthreads == 0); - uint64_t grainsize = size / nthreads; - for(uint64_t i=0; i -a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs){ - if(end-start < gs){ - a acc = m.id(); - for(uint64_t i=start; i +#include + ... - double sum = reduce(Sum(), big); +reduction void sum(double* l, double r, double unit){ + *l += r; +} + +double red=0.0; +forall(...){ + sum(&red, x, 0.0) ; +} ``` diff --git a/kitsune/include/kitsune/Config/config.h.cmake b/kitsune/include/kitsune/Config/config.h.cmake index 63eef3ac05aea..4e93c06c9756d 100644 --- a/kitsune/include/kitsune/Config/config.h.cmake +++ b/kitsune/include/kitsune/Config/config.h.cmake @@ -274,4 +274,7 @@ constexpr unsigned KITSUNE_MAX_FIXED_THREADS_PER_BLOCK = 1024; // ----------------------------------------------------------------------------- +// Is the gpu target enabled +#cmakedefine01 KITSUNE_GPU_ENABLED + #endif // KITSUNE_CONFIG_H diff --git a/kitsune/include/kitsune/Core/Tapir.h b/kitsune/include/kitsune/Core/Tapir.h index d31e0b13c0c23..eb8da83bf17b9 100644 --- a/kitsune/include/kitsune/Core/Tapir.h +++ b/kitsune/include/kitsune/Core/Tapir.h @@ -55,7 +55,7 @@ enum class TTID : uint32_t { /// Lower to kitsune's JIT-enabled, GPU-agnostic runtime. 
/// FIXME: This has been disabled for now, but should be re-enabled shortly. - // GPUABI = 0x10, + GPU = 0x10, /// Lower to the qthreads runtime. /// FIXME: This is currently disabled and needs to be updated before it can be @@ -78,6 +78,7 @@ enum class TTID : uint32_t { /// FIXME: Almost certainly obsolete. OpenMP = 0x200, + }; /// Convert the integer to a \ref TTID. If the integer cannot be converted to a diff --git a/kitsune/include/kitsune/Core/TapirTargets.h b/kitsune/include/kitsune/Core/TapirTargets.h index 808fadf77b394..98b358d960775 100644 --- a/kitsune/include/kitsune/Core/TapirTargets.h +++ b/kitsune/include/kitsune/Core/TapirTargets.h @@ -41,8 +41,12 @@ #include "llvm/Transforms/Tapir/QthreadsABI.h" #endif // KITSUNE_QTHREADS_ENABLED -#if KITSNUE_REALM_ENABLED +#if KITSUNE_REALM_ENABLED #include "llvm/Transforms/Tapir/RealmABI.h" #endif // KITSUNE_REALM_ENABLED +#if KITSUNE_GPU_ENABLED +#include "llvm/Transforms/Tapir/GPUABI.h" +#endif // KITSUNE_REALM_ENABLED + // #endif // LLVM_TAPIR_TARGETS_H diff --git a/kitsune/lib/Analysis/TapirTargetAnalysis.cpp b/kitsune/lib/Analysis/TapirTargetAnalysis.cpp index 51dc77aa323c9..e0c68d8b2a51c 100644 --- a/kitsune/lib/Analysis/TapirTargetAnalysis.cpp +++ b/kitsune/lib/Analysis/TapirTargetAnalysis.cpp @@ -87,6 +87,11 @@ createTT(TTID id, Module &m, const TapirTargetOptions &tto) { case TTID::Realm: return std::make_unique(m); #endif // KITSUNE_REALM_ENABLED + +#if KITSUNE_GPU_ENABLED + case TTID::GPU: + return std::make_unique(m, tto); +#endif // KITSUNE_REALM_ENABLED default: llvm_unreachable("createTT: TTID not handled"); diff --git a/kitsune/lib/Core/TapirTargetOptions.cpp b/kitsune/lib/Core/TapirTargetOptions.cpp index 4f6f5e6a10ea1..fec422593eba9 100644 --- a/kitsune/lib/Core/TapirTargetOptions.cpp +++ b/kitsune/lib/Core/TapirTargetOptions.cpp @@ -47,7 +47,7 @@ static cl::opt clEnumValN(TTID::Cuda, "cuda", ""), clEnumValN(TTID::Hip, "hip", ""), clEnumValN(TTID::OpenCilk, "opencilk", ""), - // 
clEnumValN(TTID::GPUABI, "gpuabi", ""), + clEnumValN(TTID::GPU, "gpu", ""), clEnumValN(TTID::Qthreads, "qthreads", ""), clEnumValN(TTID::Realm, "realm", ""), clEnumValN(TTID::Lambda, "lambda", ""), diff --git a/kitsune/lib/Support/Deserialize.cpp b/kitsune/lib/Support/Deserialize.cpp index 547d9a5806586..629afd5012d07 100644 --- a/kitsune/lib/Support/Deserialize.cpp +++ b/kitsune/lib/Support/Deserialize.cpp @@ -22,6 +22,7 @@ std::optional llvm::createTTIDFrom(StringRef s) { .Case("nolo", TTID::Nolo) .Case("cuda", TTID::Cuda) .Case("hip", TTID::Hip) + .Case("gpu", TTID::GPU) .Case("lambda", TTID::Lambda) .Case("omptask", TTID::OMPTask) .Case("opencilk", TTID::OpenCilk) @@ -45,8 +46,7 @@ std::optional llvm::createTTIDFrom(uint32_t u) { case 0x8: return TTID::OpenCilk; case 0x10: - llvm_unreachable("createTTIDFrom: GPUABI has not been enabled"); - // return TTID::GPUABI; + return TTID::GPU; case 0x20: return TTID::Qthreads; case 0x40: diff --git a/kitsune/lib/Support/ToString.cpp b/kitsune/lib/Support/ToString.cpp index 41b1163534580..04f66f6d8b5bc 100644 --- a/kitsune/lib/Support/ToString.cpp +++ b/kitsune/lib/Support/ToString.cpp @@ -37,6 +37,8 @@ std::string llvm::toString(const TTID &tt) { return "qthreads"; case TTID::Realm: return "realm"; + case TTID::GPU: + return "gpu"; } llvm_unreachable("toString: TTID not handled"); } diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 90f0dd6dc3edf..658ff27f06a8e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -454,7 +454,7 @@ set(KITSUNE_REALM_ENABLED OFF CACHE INTERNAL "Enable 'realm' tapir target" FORCE set(KITSUNE_SERIAL_ENABLED ON CACHE INTERNAL "Enable 'serial' tapir target" FORCE) set(KITSUNE_KNOWN_TAPIR_TARGETS - "cuda;hip;lambda;omptask;opencilk;openmp;qthreads;realm;serial" + "cuda;hip;gpu;lambda;omptask;opencilk;openmp;qthreads;realm;serial" CACHE INTERNAL "All known Tapir targets" FORCE) @@ -494,6 +494,8 @@ foreach(target IN LISTS KITSUNE_ENABLED_TAPIR_TARGETS) message(FATAL_ERROR 
"Unknown Tapir target '${target}'") endif () endforeach () +message(WARNING "CUDA: ${KITSUNE_CUDA_ENABLED}") +message(WARNING "GPU: ${KITSUNE_GPU_ENABLED}") list(SORT KITSUNE_ENABLED_TAPIR_TARGETS_STR) list(JOIN KITSUNE_ENABLED_TAPIR_TARGETS_STR " " KITSUNE_ENABLED_TAPIR_TARGETS_STR) diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index aa2a9ec523d84..b88603738b846 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -277,11 +277,7 @@ def ReadNone : EnumAttr<"readnone", IntersectAnd, [ParamAttr]>; def ReadOnly : EnumAttr<"readonly", IntersectAnd, [ParamAttr]>; /// Tapir reducer-related attributes. -def HyperView : EnumAttr<"hyper_view", [FnAttr]>; -def HyperToken : EnumAttr<"hyper_token", [FnAttr]>; -def ReducerRegister : EnumAttr<"reducer_register", [FnAttr]>; -def ReducerUnregister : EnumAttr<"reducer_unregister", [FnAttr]>; -def KitsuneReduction : EnumAttr<"kitsune_reduction", [FnAttr]>; +def KitsuneReduction : EnumAttr<"kitsune_reduction", IntersectPreserve, [FnAttr]>; /// Return value is always equal to this argument. 
def Returned : EnumAttr<"returned", IntersectAnd, [ParamAttr]>; diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h index e11aeff31f696..0ad6808a5420f 100644 --- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h +++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h @@ -26,7 +26,7 @@ class LLVMLoop; class GPUABI : public TapirTarget { LLVMLoop *LOP = nullptr; public: - GPUABI(Module &M) : TapirTarget(M) {} + GPUABI(Module &M, const TapirTargetOptions &TTO) : TapirTarget(M, TTO) {}; ~GPUABI() {} Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; void lowerSync(SyncInst &SI) override final; @@ -49,8 +49,7 @@ class GPUABI : public TapirTarget { void processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) override final; - LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL, - OptimizationLevel OptLevel = OptimizationLevel=O2) + LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL) override final; }; diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h index 5b130c3e89d62..f148f20ff1820 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h @@ -24,7 +24,7 @@ class LoopStripMinePass : public PassInfoMixin { public: explicit LoopStripMinePass() {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM); }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h b/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h deleted file mode 100644 index e4305d139cb26..0000000000000 --- a/llvm/include/llvm/Transforms/Tapir/TapirTargetIDs.h +++ /dev/null @@ -1,115 +0,0 @@ -//===- TapirTargetIDs.h - Tapir target ID's --------------------*- C++ -*--===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file enumerates the available Tapir lowering targets. -// -//===----------------------------------------------------------------------===// - -#ifndef TAPIR_TARGET_IDS_H_ -#define TAPIR_TARGET_IDS_H_ - -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/raw_ostream.h" - -namespace llvm { - -enum class TapirTargetID { - None, // Perform no lowering - Serial, // Lower to serial projection - Cuda, // Lower to Cuda ABI - Hip, // Lower to the Hip (AMD GPU) ABI - GPU, // Lower to the GPU ABI - Lambda, // Lower to generic Lambda ABI - OMPTask, // Lower to OpenMP task ABI - OpenCilk, // Lower to OpenCilk ABI - OpenMP, // Lower to OpenMP (TODO: Needs to be updated) - Qthreads, // Lower to Qthreads (TODO: Needs to be updated) - Realm, // Lower to Realm (TODO: Needs to be updated) - Last_TapirTargetID -}; - -// Serialize the Tapir target into the given output stream. This will write a -// string representation that is compatible with the -ftapir argument used in -// clang. -raw_ostream &operator<<(raw_ostream &os, const TapirTargetID &Target); - -enum class TapirNVArchTargetID { - SM_50, // TODO: Remove depcreated targets based on latest CUDA releases. - SM_52, - SM_53, - SM_60, // Pascal - SM_61, - SM_62, - SM_70, // Volta - SM_72, - SM_75, // Turing - SM_80, // Ampere - SM_86, - SM_90, // Hopper - // TODO: Update this enum when we sync w/ upstream LLVM capabilities. - Last_TapirNVArchTargetID -}; - -// Serialize the Tapir target into the given output stream. This will write a -// string representation that is compatible with the -ftapir argument used in -// clang. -raw_ostream &operator<<(raw_ostream &os, const TapirTargetID &Target); - -// Tapir target options - -// Virtual base class for Target-specific options. 
-class TapirTargetOptions { -public: - enum TapirTargetOptionKind { TTO_OpenCilk, Last_TTO }; - -private: - const TapirTargetOptionKind Kind; - -public: - TapirTargetOptionKind getKind() const { return Kind; } - - TapirTargetOptions(TapirTargetOptionKind K) : Kind(K) {} - TapirTargetOptions(const TapirTargetOptions &) = delete; - TapirTargetOptions &operator=(const TapirTargetOptions &) = delete; - virtual ~TapirTargetOptions() {} - - // Top-level method for cloning TapirTargetOptions. Defined in - // TargetLibraryInfo. - TapirTargetOptions *clone() const; -}; - -// Options for OpenCilkABI Tapir target. -class OpenCilkABIOptions : public TapirTargetOptions { - std::string RuntimeBCPath; - - OpenCilkABIOptions() = delete; - -public: - OpenCilkABIOptions(StringRef Path) - : TapirTargetOptions(TTO_OpenCilk), RuntimeBCPath(Path) {} - - StringRef getRuntimeBCPath() const { - return RuntimeBCPath; - } - - static bool classof(const TapirTargetOptions *TTO) { - return TTO->getKind() == TTO_OpenCilk; - } - -protected: - friend TapirTargetOptions; - - OpenCilkABIOptions *cloneImpl() const { - return new OpenCilkABIOptions(RuntimeBCPath); - } -}; - -} // end namespace llvm - -#endif diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index fc623989e2c29..c21e98f38e980 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -14,6 +14,7 @@ /// //===----------------------------------------------------------------------===// +#include "kitsune/Analysis/TapirTargetAnalysis.h" #include "kitsune/Passes/PipelineUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -1734,14 +1735,21 @@ PassBuilder::buildTapirLoopLoweringPipeline(OptimizationLevel Level, /*UseBlockFrequencyInfo=*/false)); // Stripmine Tapir loops, if pass is enabled. + // This was moved from buildModuleOptimizationPipeline, I (George) expect it to be + // beneficial for all parallel backends. 
Needs performance evaluation to confirm. if (PTO.LoopStripmine) { + // We need to do tapir target analysis to determine how to stripmine + MPM.addPass(RequireAnalysisPass()); + //MPM.addPass( + //createModuleToFunctionPassAdaptor(InvalidateAnalysisPass())); + LoopPassManager LPM1, LPM2; - LPM1.addPass(TapirIndVarSimplifyPass()); + LPM1.addPass(IndVarSimplifyPass(true, true)); FPM.addPass( createFunctionToLoopPassAdaptor(std::move(LPM1), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(LoopStripMinePass()); + MPM.addPass(LoopStripMinePass()); // Cleanup tasks after stripmining loops. FPM.addPass(TaskSimplifyPass()); // Cleanup after stripmining loops. diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 8e62e9a2b97f6..063eccc8c83a8 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -192,6 +192,7 @@ MODULE_PASS("kit-kernel-properties", RecomputeKernelPropertiesPass()) MODULE_PASS("kit-prefetch", PrefetchingPass()) MODULE_PASS("kit-lower-intrinsics", LowerKitsuneIntrinsicsPass()) MODULE_PASS("kit-strip-addr-spaces", StripKitsuneAddrSpacesPass()) +MODULE_PASS("loop-stripmine", LoopStripMinePass()) #undef MODULE_PASS @@ -510,7 +511,6 @@ FUNCTION_PASS("loop-fusion", LoopFusePass()) FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) -FUNCTION_PASS("loop-stripmine", LoopStripMinePass()) FUNCTION_PASS("loop-versioning", LoopVersioningPass()) FUNCTION_PASS("lower-atomic", LowerAtomicPass()) FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index bede4ec6c26e7..79e530c6362b1 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -284,7 +284,7 @@ bool 
DeadArgumentEliminationPass::removeDeadArgumentsFromCallers(Function &F) { if (F.hasFnAttribute(Attribute::Naked)) return false; - // Don't operate on optnone + // Don't operate on optnone (we need this for kitsune reductions) if (F.hasFnAttribute(Attribute::OptimizeNone)) return false; @@ -734,7 +734,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { if (FrozenFunctions.count(F)) return false; - // Don't operate on optnone + // Don't operate on optnone (we need this for kitsune reductions) if (F->hasFnAttribute(Attribute::OptimizeNone)) return false; diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index ec5ccd68cdce9..c6502fbeb8938 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -78,8 +78,7 @@ void GPUABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { } LoopOutlineProcessor * -GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL, - OptimizationLevel OptLevel = OptimizationLevel=O2) { +GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL) { if(!LOP) return new LLVMLoop(M); return LOP; @@ -89,7 +88,7 @@ GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL, unsigned LLVMLoop::NextKernelID = 0; LLVMLoop::LLVMLoop(Module &M) - : LoopOutlineProcessor(M, LLVMM), LLVMM("kernelModule", M.getContext()) { + : LoopOutlineProcessor(M, TTOpts), LLVMM("kernelModule", M.getContext()) { ValueToValueMapTy VMap; // LLVMMptr = CloneModule(M, vmap, [](const GlobalValue* gv) { return false; }); // And named metadata.... @@ -103,7 +102,7 @@ LLVMLoop::LLVMLoop(Module &M) // Setup an LLVM triple. Triple LLVMTriple("spir64-unknown-unknown"); - LLVMM.setTargetTriple(LLVMTriple.str()); + LLVMM.setTargetTriple(LLVMTriple); // Insert runtime-function declarations in LLVM host modules. 
Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 5774398067ea8..b1db7abb46776 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -955,6 +955,15 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Value *StepSize; Value *BranchVal; // Int the gpu case we don't need an epilogue + // If we start with forall(i=0..n) + // GPU stripmine converts to + // forall(i=0; igetEntryBlock().getFirstNonPHI()); @@ -968,7 +977,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, IRBuilder<> B2(bloc); StepSize = B2.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::tapir_loop_grainsize, + Intrinsic::getOrInsertDeclaration(M, Intrinsic::tapir_loop_grainsize, { TripCount->getType() }), { TripCount }); @@ -1005,7 +1014,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, ConstantInt::get(BECount->getType(), Count), "xtraiter"); } - BranchVal = B.CreateICmpULT( + BranchVal = B.CreateICmpSLT( BECount, ConstantInt::get(BECount->getType(), TL.isInclusiveRange() ? Count : Count - 1)); } @@ -1096,7 +1105,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, *RemainderLoop = cloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, - ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI); + ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI, Count); // Insert the cloned blocks into the function. 
F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), @@ -1157,21 +1166,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, &ClonedInlinedLPads, &ClonedDetachedRethrows, - NeedToInsertTaskFrame, DT, LI); + NeedToInsertTaskFrame, DT, nullptr, LI); } - SmallVector ClonedDetachedRethrows; - for (Instruction *DR : DetachedRethrows) { - if (VMap[DR]) - ClonedDetachedRethrows.push_back(cast(VMap[DR])); - else - ClonedDetachedRethrows.push_back(DR); - } - DetachInst *ClonedDI = cast(VMap[DI]); - // Serialize the new task. - SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, - ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, - &ClonedInlinedLPads, &ClonedDetachedRethrows, - NeedToInsertTaskFrame, DT, nullptr, LI); } // Detach the stripmined loop. @@ -1561,7 +1557,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Header->getFirstNonPHIIt()); // Initialize inner index to zero. //Value *Zero = ConstantInt::get(PrimaryInduction->getType(), 0); - B2.SetInsertPoint(LatchBR->getParent()->getFirstNonPHI()); + B2.SetInsertPoint(LatchBR->getParent()->getFirstNonPHIIt()); // Instead of subtracting one, add the grainsize. Value *NextIdx = B2.CreateAdd(InnerIdx, StepSize, @@ -1615,7 +1611,6 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // InnerIdx->addIncoming(InnerAdd, Latch); LatchBR->setCondition(InnerCmp); } ->>>>>>> 82193e08056a (GPU reductions via stripmining pass working) // Connect the epilog code to the original loop and update the PHI functions. B2.SetInsertPoint(EpilogPreheader->getTerminator()); @@ -1680,8 +1675,45 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // FIXME: Recalculating TaskInfo for the whole function is wasteful. // Optimize this routine in the future. 
//TI->recalculate(*F, *DT); - - // accumulate reductions in main loop + + // Reductions take a parallel loop + // forall(i=0; i N){ + // for(...) // epilogue logic + // BODY + // sum(&reds[i], a[j]; 0.0) + // } + // for(i=0; i& blocks = L->getBlocks(); std::set> reductions; for (BasicBlock *BB : blocks){ @@ -1716,12 +1748,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // // // accumulate reductions in epilog loop - LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); + LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); - std::vector> redMap; + // Associates calls to reduction functions, first argument to reduction + // function, local reduction allocation, type of unit, unit + std::vector> redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. // TODO: Initialize local reductions with unit values + // TODO: move insertion point for reduction allocation + // TODO: free reduction allocation Instruction* bloc = nullptr; if(Instruction* I = dyn_cast(TripCount)){ bloc = I->getParent()->getTerminator(); @@ -1737,6 +1773,11 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, else outerIters = StepSize; + // Here we iterate over the reductions (calls to reduction functions), and + // allocate the local reduction variable array, and build the association + // array redMap, and replace references to the original reduction variable + // with references to the new local reduction variable in the body of the + // inner loop auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); for(auto &pair : reductions){ // TODO: generic allocation/free calls @@ -1790,21 +1831,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // start the reduction with that value). 
LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F); - // We insert the reduction code at every sync corresponding to the strimined - // loop - // - // Sync - // RedEpiHeader - // RedEpiBody - // RedEpiExit - - // Todo: re-order epilogue and reduction epilogue to preserve associativity if(!reductions.empty()){ - // Peel the first iteration of the loop and replace the reduction calls in - // the peeled code with stores - // Can't do this in general if the reduction is conditional ValueToValueMapTy VMap; - //peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); SmallVector CIS; for(auto &BB : NewLoop->blocks()){ // We find the location that we reduce into and create a store of unit @@ -1823,12 +1851,14 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, } } } - - /* - for(auto &I : CIS){ - I->eraseFromParent(); - } - */ + + // We insert the reduction code at every sync corresponding to the stripmined + // loop + // + // Sync + // RedEpiHeader + // RedEpiBody + // RedEpiExit Instruction* term = LatchExit->getTerminator(); BasicBlock *PostSync = term->getSuccessor(0); @@ -1838,8 +1868,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, BranchInst::Create(PostSync, RedEpiHeader); PHINode *Idx = PHINode::Create(outerIters->getType(), 2, "reductionepilogueidx", - RedEpiHeader->getFirstNonPHI()); - IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + RedEpiHeader->getFirstNonPHIIt()); + IRBuilder<> BH(RedEpiHeader, RedEpiHeader->getFirstNonPHIIt()); Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit); Instruction *bodyTerm, *exitTerm; Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); diff --git a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp index e4947490c7e7a..56c81b0396bbb 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp @@ 
-130,8 +130,8 @@ static bool tryToStripMineLoop( "form.\n"); return false; } - bool StripMiningRequested = - (hasLoopStripmineTransformation(L) == TM_ForcedByUser); + //bool StripMiningRequested = + // (hasLoopStripmineTransformation(L) == TM_ForcedByUser); TargetTransformInfo::StripMiningPreferences SMP = gatherStripMiningPreferences(L, SE, TTI, ProvidedCount); @@ -286,14 +286,16 @@ static bool tryToStripMineLoop( // TODO: change this to check tapir loop attributes for custom target bool GPU = false; - auto target = TLI->getTapirTarget(); + + TTID target = TGI.hasTTID() ? TGI.getTTID() : TTID::OpenCilk; + switch(target){ // We don't want to stripmine for serial targets - case TapirTargetID::Serial: + case TTID::Serial: + case TTID::Cuda: + case TTID::Hip: return false; - case TapirTargetID::GPU: - case TapirTargetID::Cuda: - case TapirTargetID::Hip: + case TTID::GPU: GPU = true; break; default: @@ -325,88 +327,10 @@ static bool tryToStripMineLoop( return true; } -namespace { - -class LoopStripMine : public LoopPass { -public: - static char ID; // Pass ID, replacement for typeid - - std::optional ProvidedCount; - - LoopStripMine(std::optional Count = std::nullopt) - : LoopPass(ID), ProvidedCount(Count) { - initializeLoopStripMinePass(*PassRegistry::getPassRegistry()); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) - return false; - - Function &F = *L->getHeader()->getParent(); - - auto &TLI = getAnalysis().getTLI(F); - auto &DT = getAnalysis().getDomTree(); - LoopInfo *LI = &getAnalysis().getLoopInfo(); - TaskInfo *TI = &getAnalysis().getTaskInfo(); - ScalarEvolution &SE = getAnalysis().getSE(); - const TargetTransformInfo &TTI = - getAnalysis().getTTI(F); - auto &AC = getAnalysis().getAssumptionCache(F); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). 
- OptimizationRemarkEmitter ORE(&F); - bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - - bool ret = tryToStripMineLoop(L, DT, LI, SE, TTI, AC, TI, ORE, &TLI, - PreserveLCSSA, ProvidedCount); - if(!ret){ - ORE.emit(DiagnosticInfoOptimizationFailure( - DEBUG_TYPE, "FailedRequestedSpawning", - L->getStartLoc(), L->getHeader()) - << "Tapir loop not stripmined"); - } - return ret; - } - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG... - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - getLoopAnalysisUsage(AU); - } -}; - -} // end anonymous namespace - -char LoopStripMine::ID = 0; - -INITIALIZE_PASS_BEGIN(LoopStripMine, "loop-stripmine", "Stripmine Tapir loops", - false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(LoopStripMine, "loop-stripmine", "Stripmine Tapir loops", - false, false) - -Pass *llvm::createLoopStripMinePass(int Count) { - // TODO: It would make more sense for this function to take the optionals - // directly, but that's dangerous since it would silently break out of tree - // callers. - return new LoopStripMine(Count == -1 ? 
std::nullopt - : std::optional(Count)); -} - -PreservedAnalyses LoopStripMinePass::run(Function &F, - FunctionAnalysisManager &AM) { - Module& M = *F.getParent(); - - auto &MAM = AM.getResult(F); - const TapirTargetInfo &TGI = *MAM.getCachedResult(M); +static bool loopStripMineImpl(Function &F, + FunctionAnalysisManager &AM, + TapirTargetInfo &TGI) { auto &TLI = AM.getResult(F); auto &SE = AM.getResult(F); auto &LI = AM.getResult(F); @@ -473,9 +397,29 @@ PreservedAnalyses LoopStripMinePass::run(Function &F, LAM->clear(L, LoopName); } + return Changed; +} + +PreservedAnalyses LoopStripMinePass::run(Module &M, + ModuleAnalysisManager &AM){ + auto &FAM = AM.getResult(M).getManager(); + auto &TGI = AM.getResult(M); + + bool Changed = false; + + for(auto &F: M){ + if(!F.empty()) + Changed |= loopStripMineImpl(F, FAM, TGI); + } + if (!Changed) return PreservedAnalyses::all(); + PreservedAnalyses PA = PreservedAnalyses::none(); // If we've changed, assume we've not preserved anything - return PreservedAnalyses::none(); + PA.preserve(); + return PA; + //return PreservedAnalyses::none(); + } + diff --git a/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll b/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll index 990c577e2e781..a338ee84a92b9 100644 --- a/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll +++ b/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll @@ -1,7 +1,7 @@ ; Check that jump threading does not thread a detach-continue edge if ; it does not also thread the corresponding reattach-continue edge. 
; -; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars,loop-unroll-full),gvn<>,instcombine,loop-mssa(licm)))),function(loop-stripmine,jump-threading)" -unroll-peel-max-count=0 -require-parallel-epilog -S | FileCheck %s +; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars,loop-unroll-full),gvn<>,instcombine,loop-mssa(licm)))),loop-stripmine,function(jump-threading)" -unroll-peel-max-count=0 -require-parallel-epilog -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll b/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll index 338af8c741b74..b1cd73cfed784 100644 --- a/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll +++ b/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll @@ -1,6 +1,6 @@ ; Check that loop-stripmining generates the correct IR to enable a zero-iteration parallel loop to be optimized away. ; -; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars),sroa))),function(loop-stripmine,early-cse,instcombine),function(simplifycfg)" -S | FileCheck %s +; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars),sroa))),loop-stripmine,function(early-cse,instcombine),function(simplifycfg)" -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll b/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll index fecf057069ec4..b217ffb6059fb 100644 --- a/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll +++ b/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll @@ -1,6 +1,6 @@ ; Check that Tapir loops can be peeled and subsequently stripmined. 
; -; RUN: opt < %s -passes='loop(loop-unroll-full),gvn,loop(tapir-indvars),loop-stripmine' -S | FileCheck %s +; RUN: opt < %s -passes='function(loop(loop-unroll-full),gvn,loop(tapir-indvars)),loop-stripmine' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu"