diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 959367a73add2..50063de0d4067 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -5229,6 +5229,14 @@ def NonString : InheritableAttr { let Documentation = [NonStringDocs]; } +// +===== kitsune-/tapir-centric attributes + +def KitsuneReduction : InheritableAttr { + let Spellings = [Clang<"kitsune_reduction">]; + let Subjects = SubjectList<[FunctionLike]>; + let Documentation = [TapirRTDocs]; +} + def TapirTarget : StmtAttr { let Spellings = [CXX11<"tapir","target">]; @@ -5247,9 +5255,9 @@ def TapirTarget : StmtAttr { ErrorDiag, "'parallel' statements">; let Args = [ EnumArgument<"TapirTargetAttrType", "TapirTargetAttrTy", /*is_string=*/ true, - ["nolo", "serial", "cuda", "hip", "opencilk", "openmp", + ["nolo", "serial", "cuda", "hip", "gpu", "opencilk", "openmp", "qthreads", "realm"], - ["Nolo", "Serial", "Cuda", "Hip", "OpenCilk", "OpenMP", + ["Nolo", "Serial", "Cuda", "Hip", "GPU", "OpenCilk", "OpenMP", "Qthreads", "Realm"], 0> ]; diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 29ff8dbe1f12f..80d80e9699a00 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -901,6 +901,9 @@ def err_drv_unsupported_option_argument_for_frontend : Error< def err_drv_kitsune_missing_required : Error< "missing required option '--%0'">; +def err_drv_kitsune_tapir_required : Error< + "--tapir is required with '%0'">; + def err_drv_kitsune_optzns_required : Error< "--tapir requires optimization level O1 or higher">; def err_drv_kitsune_lto_o2_required : Error< @@ -944,8 +947,6 @@ def err_drv_opencilk_missing_abi_bitcode: Error< def err_drv_kitsune_kokkos_disabled : Error< "kokkos support was not enabled when kitsune was built">; -def err_drv_kitsune_kokkos_no_tapir : Error< - "--tapir is required with '%0'">; def 
err_drv_kitsune_target_not_enabled: Error< "tapir target '%0' was not enabled when kitsune was built">; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f71de4f5bfa4d..6d958332755ed 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4283,7 +4283,7 @@ def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group, HelpText<"Do not process trigraph sequences">, Visibility<[ClangOption, CC1Option]>; def funique_source_file_names: Flag<["-"], "funique-source-file-names">, Group, - HelpText<"Allow the compiler to assume that each translation unit has a unique " + HelpText<"Allow the compiler to assume that each translation unit has a unique " "source file identifier (see -funique-source-file-identifier) at link time">; def fno_unique_source_file_names: Flag<["-"], "fno-unique-source-file-names">; def unique_source_file_identifier_EQ: Joined<["-"], "funique-source-file-identifier=">, Group, @@ -7084,7 +7084,7 @@ defm android_pad_segment : BooleanFFlag<"android-pad-segment">, Group; def shared_libflangrt : Flag<["-"], "shared-libflangrt">, HelpText<"Link the flang-rt shared library">, Group, Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>; -def static_libflangrt : Flag<["-"], "static-libflangrt">, +def static_libflangrt : Flag<["-"], "static-libflangrt">, HelpText<"Link the flang-rt static library">, Group, Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>; @@ -9508,6 +9508,15 @@ def fkokkos_no_init : Flag<["-"], "fkokkos-no-init">, Alias, Visibility<[ClangOption, CC1Option]>, HelpText<"DEPRECATED: Use --kokkos-no-init">; +// Tapir currently requires at least -O1. This makes it difficult to write tests +// that check that the frontend generates the expected LLVM IR. If this +// restriction is ever removed, this option will no longer be necessary since +// we could just pass -O0 to examine the IR generated by the frontend. 
+def print_before_first: Joined<["--"], "print-before-first">, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, + HelpText<"Print the LLVM Module to stderr before running the optimization " + "pipeline">; + def kitrt_verbose: Joined<["--"], "kitrt-verbose">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Enable verbose mode in kitsune's runtime">; @@ -9515,7 +9524,7 @@ def kitrt_verbose: Joined<["--"], "kitrt-verbose">, def tapir_EQ : Joined<["--"], "tapir=">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, MetaVarName<"">, - Values<"none,serial,cuda,hip,opencilk">, + Values<"nolo,serial,cuda,hip,opencilk">, HelpText<"The primary tapir target">; def ftapir_EQ : Joined<["-"], "ftapir=">, Alias, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 0a82af6fbbbe6..03fb8a71a6746 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1216,6 +1216,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( return; } + const KitsuneOptions& kitOpts = CI.getKitsuneOpts(); + if (kitOpts.hasTTID() && kitOpts.getPrintBeforeFirst()) + llvm::errs() << *TheModule << "\n"; + // Now that we have all of the passes ready, run them. 
{ PrettyStackTraceString CrashInfo("Optimizer"); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 2336c3da3b38c..9fad6a8bd4f39 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2512,6 +2512,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly()); FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); } + if (TargetDecl->hasAttr()) { + FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction); + } if (const auto *RA = TargetDecl->getAttr(); RA && RA->getDeallocator() == nullptr) RetAttrs.addAttribute(llvm::Attribute::NoAlias); diff --git a/clang/lib/CodeGen/CGKitsune.cpp b/clang/lib/CodeGen/CGKitsune.cpp index 9f99ee449e832..d026d74f0e7bb 100644 --- a/clang/lib/CodeGen/CGKitsune.cpp +++ b/clang/lib/CodeGen/CGKitsune.cpp @@ -59,6 +59,7 @@ #include "clang/CodeGen/CGFunctionInfo.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/IR/ValueMap.h" +#include "llvm/IR/FixedPointBuilder.h" using namespace clang; using namespace CodeGen; @@ -106,6 +107,8 @@ CodeGenFunction::GetTapirTargetAttr(ArrayRef Attrs) { return llvm::TTID::Cuda; case TapirTargetAttr::Hip: return llvm::TTID::Hip; + case TapirTargetAttr::GPU: + return llvm::TTID::GPU; case TapirTargetAttr::OpenCilk: return llvm::TTID::OpenCilk; case TapirTargetAttr::OpenMP: @@ -326,6 +329,12 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S, // Evaluate the initialization before the loop. 
EmitStmt(S.getInit()); + // TODO: explain more + // We assume that the boolean is a binary operator and pre-compute RHS + auto* BO = dyn_cast(S.getCond()); + llvm::Value *RHS = EmitScalarExpr(BO->getRHS()); + + // In a parallel loop there will always be a condition block // so there is no need to test JumpDest Condition = getJumpDestInCurrentScope("forall.cond"); @@ -364,7 +373,59 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S, // C99 6.8.5p2/p4: The first substatement is executed if the expression // compares unequal to 0. The condition must be a scalar type. - llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond()); + + //llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond()); + llvm::Value *LHS = EmitScalarExpr(BO->getLHS()); + QualType LHSTy = BO->getLHS()->getType(); + llvm::Value *BoolCondVal; + if (LHSTy->hasSignedIntegerRepresentation()) { + switch(BO->getOpcode()) { + case BO_GT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGT, LHS, RHS, "cmp"); + break; + case BO_GE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGE, LHS, RHS, "cmp"); + break; + case BO_LT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, LHS, RHS, "cmp"); + break; + case BO_LE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLE, LHS, RHS, "cmp"); + break; + case BO_NE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp"); + break; + case BO_EQ: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp"); + break; + default: + llvm_unreachable("Invalid comparison in forall"); + } + } else { + switch(BO->getOpcode()){ + case BO_GT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGT, LHS, RHS, "cmp"); + break; + case BO_GE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGE, LHS, RHS, "cmp"); + break; + case BO_LT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_ULT, LHS, RHS, "cmp"); + break; + case BO_LE: + BoolCondVal = 
Builder.CreateICmp(llvm::ICmpInst::ICMP_ULE, LHS, RHS, "cmp"); + break; + case BO_NE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp"); + break; + case BO_EQ: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp"); + break; + default: + llvm_unreachable("Invalid comparison in forall"); + } + } + Builder.CreateCondBr( BoolCondVal, Detach, Sync.getBlock(), createProfileWeightsForLoop(S.getCond(), getProfileCount(S.getBody()))); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 5381edbe61a3d..9897492f05cee 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -161,6 +161,9 @@ static void CheckTTEnabled(const Driver &D, llvm::TTID TT) { if (!KITSUNE_REALM_ENABLED) D.Diag(diag::err_drv_kitsune_target_not_enabled) << llvm::toString(TT); return; + case llvm::TTID::GPU: + // TODO: Check this + return; case llvm::TTID::Serial: // The serial tapir target is always enabled return; @@ -229,13 +232,17 @@ static void CheckKitsuneOptions(const Driver &D, const ArgList &Args, // If --kokkos is provided, then a tapir target must also be provided. if (!Args.hasArg(options::OPT_tapir_EQ)) { - D.Diag(diag::err_drv_kitsune_kokkos_no_tapir) + D.Diag(diag::err_drv_kitsune_tapir_required) << Args.getLastArg(options::OPT_kokkos, options::OPT_kokkos_no_init) ->getSpelling(); return; } } + if (const Arg *A = Args.getLastArg(options::OPT_print_before_first)) + if (!Args.hasArg(options::OPT_tapir_EQ)) + D.Diag(diag::err_drv_kitsune_tapir_required) << A->getSpelling(); + // Check that the -ftapir flag has a valid value. This stops us from // reporting multiple errors because the flag is examined in several places. 
if (const Arg *A = Args.getLastArg(options::OPT_tapir_EQ)) { diff --git a/clang/lib/Driver/KitsuneOptionUtils.cpp b/clang/lib/Driver/KitsuneOptionUtils.cpp index a288bdf7f7477..00a2de5e70298 100644 --- a/clang/lib/Driver/KitsuneOptionUtils.cpp +++ b/clang/lib/Driver/KitsuneOptionUtils.cpp @@ -112,6 +112,8 @@ clang::getTapirTargetConfigFileName(const opt::ArgList &args) { return "qthreads.cfg"; case TTID::Realm: return "realm.cfg"; + case TTID::GPU: + return "gpu.cfg"; } llvm_unreachable("getTapirTargetConfigFile: TTID not handled"); } @@ -323,6 +325,7 @@ static bool parseKitsuneTTArgs(KitsuneOptions &kitOpts, TTID tt, case llvm::TTID::Realm: return parseKitsuneRealmArgs(kitOpts, args, optTable, diags); case llvm::TTID::Serial: + case llvm::TTID::GPU: // TODO:? return true; } llvm_unreachable("ParseKitsuneTTArgs: TTID not handled"); @@ -335,6 +338,7 @@ bool clang::parseKitsuneArgs(KitsuneOptions &kitOpts, const char *argv0, kitOpts.setKitsuneFrontend(IsKitsuneFrontend(argv0)); kitOpts.setStripmineLoops(args.hasArg(OPT_fstripmine)); + kitOpts.setPrintBeforeFirst(args.hasArg(OPT_print_before_first)); kitOpts.setTapirVerbose(args.hasArg(OPT_tapir_verbose)); kitOpts.setKitrtVerbose(args.hasArg(OPT_kitrt_verbose)); diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 9f7a3a4a83e23..c2d263e58a3af 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -2277,6 +2277,8 @@ void ToolChain::AddKitsunePreprocessorArgs(const ArgList &Args, switch (TT) { case TTID::Nolo: return; + case TTID::GPU: + return; case TTID::Cuda: return ExtractArgsFromString(KITSUNE_CUDA_EXTRA_PREPROCESSOR_FLAGS, CmdArgs, Args); @@ -2356,6 +2358,8 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_COMPILER_FLAGS, CmdArgs, Args); return; + case llvm::TTID::GPU: + return; case TTID::OpenMP: AddKitsuneOpenMPCommonArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_COMPILER_FLAGS, 
CmdArgs, Args); @@ -2385,6 +2389,7 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, if (std::optional TT = parseTapirTargetIfValid(Args)) { Args.AddLastArg(CmdArgs, options::OPT_ffp_contract); + Args.AddLastArg(CmdArgs, options::OPT_print_before_first); Args.AddLastArg(CmdArgs, options::OPT_kitrt_verbose); Args.AddLastArg(CmdArgs, options::OPT_tapir_verbose); Args.AddLastArg(CmdArgs, options::OPT_tapir_EQ); @@ -2563,6 +2568,8 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args, AddKitsuneOpenCilkLinkerArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_LINKER_FLAGS, CmdArgs, Args); return; + case TTID::GPU: + return; case TTID::OpenMP: AddKitsuneOpenMPLinkerArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_LINKER_FLAGS, CmdArgs, Args); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index c9309fbebae6f..9a47676b2fe95 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4813,6 +4813,7 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts, case llvm::TTID::Qthreads: case llvm::TTID::Realm: case llvm::TTID::Serial: + case llvm::TTID::GPU: return; } llvm_unreachable("GenerateKitsuneArgs: TTID not handled"); @@ -4849,6 +4850,9 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts, if (Opts.getStripmineLoops()) GenerateArg(Consumer, OPT_fstripmine); + + if (Opts.getPrintBeforeFirst()) + GenerateArg(Consumer, OPT_print_before_first); } bool CompilerInvocation::CheckKitsuneArgs(const ArgList &Args, diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 2ab0b218cf8c6..19f7b537003e6 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7902,10 +7902,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_TypeNullable: handleNullableTypeAttr(S, D, AL); break; - case 
ParsedAttr::AT_VTablePointerAuthentication: handleVTablePointerAuthentication(S, D, AL); break; + case ParsedAttr::AT_KitsuneReduction: + handleSimpleAttribute(S, D, AL); + break; } } diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 51a159697a0a1..bc93989d472ca 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -1053,6 +1053,9 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { return; } + if (kitsuneOpts.hasTTID() && kitsuneOpts.getPrintBeforeFirst()) + llvm::errs() << *llvmModule << "\n"; + // Run the passes. mpm.run(*llvmModule, mam); diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c new file mode 100644 index 0000000000000..335c6c585ec28 --- /dev/null +++ b/kitsune-tests/reductions/l2.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(double *a, double b, double unit){ + *a += b; +} + +double l2(uint64_t n, double* a){ + double red = 0; + forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(double *a, double b, double unit){ + *a += b; +} + +double l2(uint64_t n, double* a){ + double red = 0; + forall(uint64_t i=0; i n/10) + sum(&red, a[i]*a[i], 0); + } + + return sqrt(red); +} + +int main(int argc, char** argv){ + int e = argc > 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include + +double l2(uint64_t n, double* a){ + double red = 0; + #pragma omp parallel for reduction(+:red) + for(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? 
atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(int *a, int b, int unit){ + *a += b + unit; +} + +int triangle(uint64_t n, int* a){ + int red = 0; + forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 100; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = e;//1ULL< + +... +reduction void sum(double* l, double r, double unit){ + *l += r; +} + +double red=0.0; +forall(...){ + sum(&red, x, 0.0) ; +} +``` + + + diff --git a/kitsune/include/kitsune/Config/config.h.cmake b/kitsune/include/kitsune/Config/config.h.cmake index 63eef3ac05aea..4e93c06c9756d 100644 --- a/kitsune/include/kitsune/Config/config.h.cmake +++ b/kitsune/include/kitsune/Config/config.h.cmake @@ -274,4 +274,7 @@ constexpr unsigned KITSUNE_MAX_FIXED_THREADS_PER_BLOCK = 1024; // ----------------------------------------------------------------------------- +// Is the gpu target enabled +#cmakedefine01 KITSUNE_GPU_ENABLED + #endif // KITSUNE_CONFIG_H diff --git a/kitsune/include/kitsune/Core/Tapir.h b/kitsune/include/kitsune/Core/Tapir.h index d31e0b13c0c23..eb8da83bf17b9 100644 --- a/kitsune/include/kitsune/Core/Tapir.h +++ b/kitsune/include/kitsune/Core/Tapir.h @@ -55,7 +55,7 @@ enum class TTID : uint32_t { /// Lower to kitsune's JIT-enabled, GPU-agnostic runtime. /// FIXME: This has been disabled for now, but should be re-enabled shortly. - // GPUABI = 0x10, + GPU = 0x10, /// Lower to the qthreads runtime. /// FIXME: This is currently disabled and needs to be updated before it can be @@ -78,6 +78,7 @@ enum class TTID : uint32_t { /// FIXME: Almost certainly obsolete. OpenMP = 0x200, + }; /// Convert the integer to a \ref TTID. 
If the integer cannot be converted to a diff --git a/kitsune/include/kitsune/Core/TapirTargets.h b/kitsune/include/kitsune/Core/TapirTargets.h index 808fadf77b394..98b358d960775 100644 --- a/kitsune/include/kitsune/Core/TapirTargets.h +++ b/kitsune/include/kitsune/Core/TapirTargets.h @@ -41,8 +41,12 @@ #include "llvm/Transforms/Tapir/QthreadsABI.h" #endif // KITSUNE_QTHREADS_ENABLED -#if KITSNUE_REALM_ENABLED +#if KITSUNE_REALM_ENABLED #include "llvm/Transforms/Tapir/RealmABI.h" #endif // KITSUNE_REALM_ENABLED +#if KITSUNE_GPU_ENABLED +#include "llvm/Transforms/Tapir/GPUABI.h" +#endif // KITSUNE_REALM_ENABLED + // #endif // LLVM_TAPIR_TARGETS_H diff --git a/kitsune/include/kitsune/Frontend/KitsuneOptions.h b/kitsune/include/kitsune/Frontend/KitsuneOptions.h index a5eafaa4d9090..81c877cade54e 100644 --- a/kitsune/include/kitsune/Frontend/KitsuneOptions.h +++ b/kitsune/include/kitsune/Frontend/KitsuneOptions.h @@ -90,6 +90,14 @@ class KitsuneOptions { /// Should loop strip-mining be enabled. unsigned stripmineLoops : 1; + /// Should the LLVM Module be printed to stderr just before running the + /// optimization passes. Tapir currently requires at least -O1. This makes it + /// difficult to write tests that check that the frontend generates the + /// expected LLVM IR. If this restriction is ever removed, this option will no + /// longer be necessary since we could just pass -O0 to examine the IR + /// generated by the frontend. + unsigned printBeforeFirst : 1; + /// Enable verbose mode for the tapir target LLVM passes. This is different /// from -mllvm -debug-only=. 
When the verbose /// flag is set, the passes may print some subset of the information that is @@ -219,6 +227,10 @@ class KitsuneOptions { void setStripmineLoops(bool stripmineLoops = true) { this->stripmineLoops = stripmineLoops; } + + void setPrintBeforeFirst(bool printBeforeFirst = true) { + this->printBeforeFirst = printBeforeFirst; + } /// @} /// @{ @@ -292,6 +304,8 @@ class KitsuneOptions { bool getStripmineLoops() const { return stripmineLoops; } + bool getPrintBeforeFirst() const { return printBeforeFirst; } + bool getTapirVerbose() const { return tapirVerbose; } bool getKitrtVerbose() const { return kitrtVerbose; } diff --git a/kitsune/include/kitsune/kitsune.h b/kitsune/include/kitsune/kitsune.h index 5f24569bf8ac3..57003b9800ff7 100644 --- a/kitsune/include/kitsune/kitsune.h +++ b/kitsune/include/kitsune/kitsune.h @@ -8,6 +8,16 @@ #ifndef __KITSUNE_KITSUNE_H__ #define __KITSUNE_KITSUNE_H__ +#include +#include + +#if defined(reduction) +#warning found reduction definition: try puttin kitsune lower in include order +#else +#define reduction __attribute__((optnone, noinline, kitsune_reduction)) +#endif + + #if defined(spawn) // FIXME KITSUNE: Should this be an error instead of a warning? #warning encountered multiple definitions of spawn! 
diff --git a/kitsune/lib/Analysis/TapirTargetAnalysis.cpp b/kitsune/lib/Analysis/TapirTargetAnalysis.cpp index 51dc77aa323c9..e0c68d8b2a51c 100644 --- a/kitsune/lib/Analysis/TapirTargetAnalysis.cpp +++ b/kitsune/lib/Analysis/TapirTargetAnalysis.cpp @@ -87,6 +87,11 @@ createTT(TTID id, Module &m, const TapirTargetOptions &tto) { case TTID::Realm: return std::make_unique(m); #endif // KITSUNE_REALM_ENABLED + +#if KITSUNE_GPU_ENABLED + case TTID::GPU: + return std::make_unique(m, tto); +#endif // KITSUNE_REALM_ENABLED default: llvm_unreachable("createTT: TTID not handled"); diff --git a/kitsune/lib/Core/TapirTargetOptions.cpp b/kitsune/lib/Core/TapirTargetOptions.cpp index 4f6f5e6a10ea1..fec422593eba9 100644 --- a/kitsune/lib/Core/TapirTargetOptions.cpp +++ b/kitsune/lib/Core/TapirTargetOptions.cpp @@ -47,7 +47,7 @@ static cl::opt clEnumValN(TTID::Cuda, "cuda", ""), clEnumValN(TTID::Hip, "hip", ""), clEnumValN(TTID::OpenCilk, "opencilk", ""), - // clEnumValN(TTID::GPUABI, "gpuabi", ""), + clEnumValN(TTID::GPU, "gpu", ""), clEnumValN(TTID::Qthreads, "qthreads", ""), clEnumValN(TTID::Realm, "realm", ""), clEnumValN(TTID::Lambda, "lambda", ""), diff --git a/kitsune/lib/Support/Deserialize.cpp b/kitsune/lib/Support/Deserialize.cpp index 547d9a5806586..629afd5012d07 100644 --- a/kitsune/lib/Support/Deserialize.cpp +++ b/kitsune/lib/Support/Deserialize.cpp @@ -22,6 +22,7 @@ std::optional llvm::createTTIDFrom(StringRef s) { .Case("nolo", TTID::Nolo) .Case("cuda", TTID::Cuda) .Case("hip", TTID::Hip) + .Case("gpu", TTID::GPU) .Case("lambda", TTID::Lambda) .Case("omptask", TTID::OMPTask) .Case("opencilk", TTID::OpenCilk) @@ -45,8 +46,7 @@ std::optional llvm::createTTIDFrom(uint32_t u) { case 0x8: return TTID::OpenCilk; case 0x10: - llvm_unreachable("createTTIDFrom: GPUABI has not been enabled"); - // return TTID::GPUABI; + return TTID::GPU; case 0x20: return TTID::Qthreads; case 0x40: diff --git a/kitsune/lib/Support/ToString.cpp b/kitsune/lib/Support/ToString.cpp index 
41b1163534580..04f66f6d8b5bc 100644 --- a/kitsune/lib/Support/ToString.cpp +++ b/kitsune/lib/Support/ToString.cpp @@ -37,6 +37,8 @@ std::string llvm::toString(const TTID &tt) { return "qthreads"; case TTID::Realm: return "realm"; + case TTID::GPU: + return "gpu"; } llvm_unreachable("toString: TTID not handled"); } diff --git a/kitsune/runtime/cuda/dylib_support.cpp b/kitsune/runtime/cuda/dylib_support.cpp index c518e72e712a9..907d4f4ca3808 100644 --- a/kitsune/runtime/cuda/dylib_support.cpp +++ b/kitsune/runtime/cuda/dylib_support.cpp @@ -114,7 +114,7 @@ bool __kitcuda_load_symbols() { DLSYM_LOAD(cuFuncGetAttribute); /* Context management */ - DLSYM_LOAD(cuCtxCreate_v3); + DLSYM_LOAD(cuCtxCreate_v4); DLSYM_LOAD(cuDevicePrimaryCtxRetain); DLSYM_LOAD(cuCtxGetCurrent); DLSYM_LOAD(cuCtxSetCurrent); diff --git a/kitsune/runtime/cuda/kitcuda.cpp b/kitsune/runtime/cuda/kitcuda.cpp index e26aafe2f5b24..76fc1c586228c 100644 --- a/kitsune/runtime/cuda/kitcuda.cpp +++ b/kitsune/runtime/cuda/kitcuda.cpp @@ -72,6 +72,7 @@ bool _kitcuda_initialized = false; int _kitcuda_device_id = -1; CUdevice _kitcuda_device = -1; +CUmemLocation _kitcuda_mem_location; CUcontext _kitcuda_context; // TODO: We currently don't use these values within the runtime but @@ -141,6 +142,9 @@ bool __kitcuda_initialize() { if (!__kitrt_get_env_value("KITCUDA_DEVICE_ID", _kitcuda_device_id)) _kitcuda_device_id = 0; + _kitcuda_mem_location.type = CU_MEM_LOCATION_TYPE_DEVICE; + _kitcuda_mem_location.id = _kitcuda_device_id; + assert(_kitcuda_device_id < device_count && "kitcuda: KITCUDA_DEVICE_ID value exceeds available number" " of devices."); diff --git a/kitsune/runtime/cuda/kitcuda.h b/kitsune/runtime/cuda/kitcuda.h index 20de2485db0c9..7ae93d4115d03 100644 --- a/kitsune/runtime/cuda/kitcuda.h +++ b/kitsune/runtime/cuda/kitcuda.h @@ -441,6 +441,7 @@ inline CUcontext __kitcuda_get_context() { #endif extern CUdevice _kitcuda_device; +extern CUmemLocation _kitcuda_mem_location; extern CUcontext 
_kitcuda_context; #define CU_SAFE_CALL(x) \ diff --git a/kitsune/runtime/cuda/kitcuda_dylib.h b/kitsune/runtime/cuda/kitcuda_dylib.h index 8e89edc835549..3ed7932e84bde 100644 --- a/kitsune/runtime/cuda/kitcuda_dylib.h +++ b/kitsune/runtime/cuda/kitcuda_dylib.h @@ -82,7 +82,7 @@ DECLARE_DLSYM(cuFuncGetName); DECLARE_DLSYM(cuFuncGetAttribute); /* Context management */ -DECLARE_DLSYM(cuCtxCreate_v3); +DECLARE_DLSYM(cuCtxCreate_v4); DECLARE_DLSYM(cuDevicePrimaryCtxRetain); DECLARE_DLSYM(cuCtxGetCurrent); DECLARE_DLSYM(cuCtxSetCurrent); diff --git a/kitsune/runtime/cuda/memory.cpp b/kitsune/runtime/cuda/memory.cpp index de1c1d24bee3d..5b8d87f490822 100644 --- a/kitsune/runtime/cuda/memory.cpp +++ b/kitsune/runtime/cuda/memory.cpp @@ -79,9 +79,9 @@ __kitcuda_mem_alloc_managed(size_t size) { // to occur on first touch -- thus our 'prefetch' status here is a bit // misleading (technically we are not prefetched to either host nor device). CU_SAFE_CALL(cuMemAdvise_p(devp, size, CU_MEM_ADVISE_SET_ACCESSED_BY, - _kitcuda_device)); + _kitcuda_mem_location)); CU_SAFE_CALL(cuMemAdvise_p(devp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - _kitcuda_device)); + _kitcuda_mem_location)); int enable = 1; CU_SAFE_CALL( @@ -267,7 +267,7 @@ void *__kitcuda_mem_gpu_prefetch(void *vp, void *opaque_stream) { // CU_MEM_ADVISE_SET_READ_MOSTLY advice flag. CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - _kitcuda_device)); + _kitcuda_mem_location)); // Issue a prefetch request on the provided stream. If the given // stream is null, create a new stream and return it. 
Once issued @@ -280,7 +280,7 @@ void *__kitcuda_mem_gpu_prefetch(void *vp, void *opaque_stream) { else cu_stream = (CUstream)__kitcuda_get_thread_stream(); - CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitcuda_device, + CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitcuda_mem_location, 0, cu_stream)); _kitcuda_mem_alloc_mutex.lock(); __kitrt_mark_mem_prefetched(vp); @@ -322,9 +322,12 @@ void *__kitcuda_mem_host_prefetch(void *vp, void *opaque_stream) { // // TODO: A lot of work needs to go into seeing if we can be // smarter about device- and host-side prefetching. + CUmemLocation cpu; + cpu.type = CU_MEM_LOCATION_TYPE_HOST; + cpu.id = 0; CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - CU_DEVICE_CPU)); + cpu)); // Issue a prefetch request on the stream associated with the // calling thread. Once issued go ahead and mark the memory as // no long being prefetched to the device/GPU. This "mark" does @@ -336,7 +339,7 @@ void *__kitcuda_mem_host_prefetch(void *vp, void *opaque_stream) { else cu_stream = (CUstream)__kitcuda_get_thread_stream(); - CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, CU_DEVICE_CPU, + CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, cpu, 0, cu_stream)); __kitrt_set_mem_prefetch(vp, false); return cu_stream; diff --git a/kitsune/test/driver/print-before-first.c b/kitsune/test/driver/print-before-first.c new file mode 100644 index 0000000000000..2e6fd1db37d3a --- /dev/null +++ b/kitsune/test/driver/print-before-first.c @@ -0,0 +1,40 @@ +// Check that the --print-before-first option is handled correctly. 
+// +// ---------------------------------------------------------------------------- +// The --print-before-first option must be used with a Kitsune frontend +// +// RUN: not %clang -O1 --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix FRONTEND +// +// FRONTEND: option '--print-before-first' must be used with a Kitsune frontend +// +// ---------------------------------------------------------------------------- +// The --print-before-first option requires the --tapir option +// +// RUN: not %kitcc -O1 --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix TT -allow-empty +// +// TT: --tapir is required with '--print-before-first' +// +// ---------------------------------------------------------------------------- +// If the --print-before-first option has been implemented correctly, mem2reg +// will not have run, so a stack slot will have been created for the function +// argument. +// +// RUN: %kitcc -O1 --tapir=serial --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix EARLY +// +// EARLY: define {{.+}}ptr @f(ptr {{.*}}%[[P:[^)]+]]) +// EARLY: %[[SLOT:.+]] = alloca ptr +// EARLY: store ptr %[[P]], ptr %[[SLOT]] +// EARLY: %[[RV:.+]] = load ptr, ptr %[[SLOT]] +// EARLY: ret ptr %[[RV]] +// +// ---------------------------------------------------------------------------- + +void* f(void* p) { + return p; +} diff --git a/kitsune/test/driver/print-before-first.f90 b/kitsune/test/driver/print-before-first.f90 new file mode 100644 index 0000000000000..1af4ae07173c3 --- /dev/null +++ b/kitsune/test/driver/print-before-first.f90 @@ -0,0 +1,39 @@ +! REQUIRES: kitfc +! +! Check that the --print-before-first option is handled correctly. +! +! ---------------------------------------------------------------------------- +! The --print-before-first option must be used with a Kitsune frontend +! +! 
RUN: not %flang -O1 --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix FRONTEND +! +! FRONTEND: option '--print-before-first' must be used with a Kitsune frontend +! +! ---------------------------------------------------------------------------- +! The --print-before-first option requires the --tapir option +! +! RUN: not %kitfc -O1 --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix TT -allow-empty +! +! TT: --tapir is required with '--print-before-first' +! +! ---------------------------------------------------------------------------- +! If the --print-before-first option has been implemented correctly, @_QQmain +! be called from @main. Since @_QQmain is empty, it will be absent after +! optimizations are run (because the empty body will have been inlined) +! +! RUN: %kitfc -O1 --tapir=serial --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix EARLY +! +! EARLY: define {{.*}}i32 @main +! EARLY: call {{.+}} @_FortranAProgramStart +! EARLY: call {{.+}} @_QQmain +! EARLY: call {{.+}} @_FortranAProgramEndStatement +! +! 
---------------------------------------------------------------------------- + +end program diff --git a/kitsune/test/tapir/cuda/kernel-opt.ll b/kitsune/test/tapir/cuda/kernel-opt.ll index 682292a342bed..189a5ad7f93ef 100644 --- a/kitsune/test/tapir/cuda/kernel-opt.ll +++ b/kitsune/test/tapir/cuda/kernel-opt.ll @@ -32,7 +32,13 @@ ; RUN: | FileCheck %s --check-prefix=O2 ; ; O2-NOT: = phi i64 -; O2: define {{.+}} @__kitcu_{{.+}}(i64 {{.*}}%[[UB:[^,]+]], i64 {{[^,]+}}, i64 {{[^,]+}}, ptr {{.*}}%[[BUF:[^,]+]], i64 {{.*}}%[[N:[^)]+]]) {{.*}}#[[ATTRS:[0-9]+]] +; O2: define {{.+}} @__kitcu_{{[^(]+}}( +; O2-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[GRAINSIZE:[^,]+]], +; O2-SAME: ptr {{[^%]*}}%[[BUF:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[N:[^)]+]]) +; O2-SAME: {{.*}}#[[ATTRS:[0-9]+]] ; O2-NEXT: [[BBENTRY:.+]]: ; O2-NEXT: %[[TID:.+]] = tail call {{(range.+ )?}}i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; O2-NEXT: %[[BIDX:.+]] = tail call {{(range.+ )?}}i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() diff --git a/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll b/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll index 90132664556fd..ed8d90f8cb134 100644 --- a/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll +++ b/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll @@ -6,7 +6,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{[^(]+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[TID:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK: %[[BIDX:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; CHECK: %[[BDIM:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() diff --git a/kitsune/test/tapir/cuda/launch.ll b/kitsune/test/tapir/cuda/launch.ll index 4c81e9c68f494..f3a2a8f608c82 100644 --- 
a/kitsune/test/tapir/cuda/launch.ll +++ b/kitsune/test/tapir/cuda/launch.ll @@ -28,8 +28,8 @@ ; These are followed by a variable number of arguments that are to be passed to ; the kernel being launched. These are typically in the order ; -; - trip count ; - start index +; - trip count ; - grain size ; - ... ; @@ -44,8 +44,8 @@ ; CHECK-SAME: i32 0, ; CHECK-SAME: ptr {{.*}}@[[G_KERNEL_PROPS]], ; CHECK-SAME: ptr %[[STREAM]], -; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 0, +; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 1, ; CHECK-SAME: ptr %c, ; CHECK-SAME: i64 %n diff --git a/kitsune/test/tapir/hip/kernel-opt.ll b/kitsune/test/tapir/hip/kernel-opt.ll index 4f58eec184fd2..a16de3d57a85d 100644 --- a/kitsune/test/tapir/hip/kernel-opt.ll +++ b/kitsune/test/tapir/hip/kernel-opt.ll @@ -32,7 +32,13 @@ ; RUN: | FileCheck %s --check-prefix=O2 ; ; O2-NOT: = phi i64 -; O2: define {{.+}} @__kithip_{{.+}}(i64 {{.*}}%[[UB:[^,]+]], i64 {{[^,]+}}, i64 {{[^,]+}}, ptr {{.*}}%[[BUF:[^,]+]], i64 {{.*}}%[[N:[^)]+]]) {{.*}}#[[ATTRS:[0-9]+]] +; O2: define {{.+}} @__kithip_{{[^(]+}}( +; O2-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[GRAINSIZE:[^,]+]], +; O2-SAME: ptr {{[^%]*}}%[[BUF:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[N:[^)]+]]) +; O2-SAME: {{.*}}#[[ATTRS:[0-9]+]] ; O2-NEXT: [[BBENTRY:.+]]: ; O2-NEXT: %[[BUFCST:.+]] = addrspacecast ptr %[[BUF]] to ptr addrspace(1) ; O2-NEXT: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.x() diff --git a/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll b/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll index 8e93d1d33071b..008d681218a27 100644 --- a/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll +++ b/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll @@ -7,7 +7,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{.+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; 
CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.y() ; CHECK: %[[TID:.+]] = zext i32 %[[WITEM]] to i64 ; CHECK: %[[BDIM:.+]] = {{.*}}call i64 @__ockl_get_local_size(i32 1) diff --git a/kitsune/test/tapir/hip/kernel-threadiv-1d.ll b/kitsune/test/tapir/hip/kernel-threadiv-1d.ll index b2a91a1d97fc0..7247002ef907b 100644 --- a/kitsune/test/tapir/hip/kernel-threadiv-1d.ll +++ b/kitsune/test/tapir/hip/kernel-threadiv-1d.ll @@ -6,7 +6,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{.+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.x() ; CHECK: %[[TID:.+]] = zext i32 %[[WITEM]] to i64 ; CHECK: %[[BDIM:.+]] = {{.*}}call i64 @__ockl_get_local_size(i32 0) diff --git a/kitsune/test/tapir/hip/launch.ll b/kitsune/test/tapir/hip/launch.ll index 72b8dec1de102..183806f2d18d1 100644 --- a/kitsune/test/tapir/hip/launch.ll +++ b/kitsune/test/tapir/hip/launch.ll @@ -29,8 +29,8 @@ ; These are followed by a variable number of arguments that are to be passed to ; the kernel being launched. These are typically in the order ; -; - trip count ; - start index +; - trip count ; - grain size ; - ... ; @@ -45,8 +45,8 @@ ; CHECK-SAME: i32 0, ; CHECK-SAME: ptr {{.*}}@[[G_KERNEL_PROPS]], ; CHECK-SAME: ptr %[[STREAM]], -; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 0, +; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 1, ; CHECK-SAME: ptr %c, ; CHECK-SAME: i64 %n diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 9192fca94451d..658ff27f06a8e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -444,6 +444,7 @@ list(JOIN KITSUNE_ENABLED_LANGS_STR " " KITSUNE_ENABLED_LANGS_STR) # affects clang. And we have ensured that kitsune is configured before clang. 
set(KITSUNE_CUDA_ENABLED OFF CACHE INTERNAL "Enable 'cuda' tapir target" FORCE) set(KITSUNE_HIP_ENABLED OFF CACHE INTERNAL "Enable 'hip' tapir target" FORCE) +set(KITSUNE_GPU_ENABLED OFF CACHE INTERNAL "Enable 'gpu' tapir target" FORCE) set(KITSUNE_LAMBDA_ENABLED OFF CACHE INTERNAL "Enable 'lambda' tapir target" FORCE) set(KITSUNE_OMPTASK_ENABLED OFF CACHE INTERNAL "Enable 'omptask' tapir target" FORCE) set(KITSUNE_OPENCILK_ENABLED OFF CACHE INTERNAL "Enable 'opencilk' tapir target" FORCE) @@ -453,7 +454,7 @@ set(KITSUNE_REALM_ENABLED OFF CACHE INTERNAL "Enable 'realm' tapir target" FORCE set(KITSUNE_SERIAL_ENABLED ON CACHE INTERNAL "Enable 'serial' tapir target" FORCE) set(KITSUNE_KNOWN_TAPIR_TARGETS - "cuda;hip;lambda;omptask;opencilk;openmp;qthreads;realm;serial" + "cuda;hip;gpu;lambda;omptask;opencilk;openmp;qthreads;realm;serial" CACHE INTERNAL "All known Tapir targets" FORCE) @@ -493,6 +494,6 @@ foreach(target IN LISTS KITSUNE_ENABLED_TAPIR_TARGETS) message(FATAL_ERROR "Unknown Tapir target '${target}'") endif () endforeach () list(SORT KITSUNE_ENABLED_TAPIR_TARGETS_STR) list(JOIN KITSUNE_ENABLED_TAPIR_TARGETS_STR " " KITSUNE_ENABLED_TAPIR_TARGETS_STR) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 14c6e28d55369..a45afe2dfe615 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -813,6 +813,7 @@ enum AttributeKindCodes { ATTR_KIND_KIT_KERNEL_PROPS = 109, ATTR_KIND_KIT_KERNEL = 110, ATTR_KIND_KIT_DEVICE = 111, + ATTR_KIND_KIT_REDUCTION = 112, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 165573f719d59..b88603738b846 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -276,6 +276,9 @@ def ReadNone : EnumAttr<"readnone", IntersectAnd,
[ParamAttr]>; /// Function only reads from memory. def ReadOnly : EnumAttr<"readonly", IntersectAnd, [ParamAttr]>; +/// Tapir reducer-related attributes. +def KitsuneReduction : EnumAttr<"kitsune_reduction", IntersectPreserve, [FnAttr]>; + /// Return value is always equal to this argument. def Returned : EnumAttr<"returned", IntersectAnd, [ParamAttr]>; diff --git a/llvm/include/llvm/Transforms/Tapir/CudaABI.h b/llvm/include/llvm/Transforms/Tapir/CudaABI.h index 15f9efd74c287..5d9edd5dd03e6 100644 --- a/llvm/include/llvm/Transforms/Tapir/CudaABI.h +++ b/llvm/include/llvm/Transforms/Tapir/CudaABI.h @@ -144,8 +144,6 @@ class CudaLoop : public LoopOutlineProcessor { // Cuda/PTX grid dimensions access. Function *CUGridDimX = nullptr, *CUGridDimY = nullptr, *CUGridDimZ = nullptr; - SmallVector OrderedInputs; - /// The GlobalValue's used in the loop that is being outlined. This includes /// functions, global variables, aliases and ifunc's. std::set UsedGlobalValues; @@ -161,25 +159,11 @@ class CudaLoop : public LoopOutlineProcessor { const TapirTargetOptions &TTOpts); ~CudaLoop(); - void setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) override final; - - unsigned getIVArgIndex(const Function &F, - const ValueSet &Args) const override final; - - unsigned getLimitArgIndex(const Function &F, - const ValueSet &Args) const override final; - void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) override; void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; - void remapData(ValueToValueMapTy &VMap) override final; }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h index 
7e0ccda6af380..0ad6808a5420f 100644 --- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h +++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h @@ -26,13 +26,13 @@ class LLVMLoop; class GPUABI : public TapirTarget { LLVMLoop *LOP = nullptr; public: - GPUABI(Module &M) : TapirTarget(M) {} + GPUABI(Module &M, const TapirTargetOptions &TTO) : TapirTarget(M, TTO) {}; ~GPUABI() {} Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; void lowerSync(SyncInst &SI) override final; void addHelperAttributes(Function &F) override final {} - void preProcessFunction(Function &F, TaskInfo &TI, + bool preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) override final; void postProcessFunction(Function &F, bool OutliningTapirLoops) override final; @@ -49,8 +49,7 @@ class GPUABI : public TapirTarget { void processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) override final; - LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL, - OptimizationLevel OptLevel = OptimizationLevel=O2) + LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL) override final; }; diff --git a/llvm/include/llvm/Transforms/Tapir/HipABI.h b/llvm/include/llvm/Transforms/Tapir/HipABI.h index 77fbc393dab01..dc818073cf4b4 100644 --- a/llvm/include/llvm/Transforms/Tapir/HipABI.h +++ b/llvm/include/llvm/Transforms/Tapir/HipABI.h @@ -159,31 +159,6 @@ class HipLoop : public LoopOutlineProcessor { const TapirTargetOptions &TTO); ~HipLoop(); - /// Prepares the set HelperArgs of function arguments for the outlined helper - /// function Helper for a Tapir loop. Also prepares the list HelperInputs of - /// input values passed to a call to Helper. HelperArgs and HelperInputs are - /// derived from the loop-control arguments LCArgs and loop-control inputs - /// LCInputs for the Tapir loop, as well the set TLInputsFixed of arguments to - /// the task underlying the Tapir loop. 
- void setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) override; - - /// Returns an integer identifying the index of the helper-function argument - /// in Args that specifies the starting iteration number. This return value - /// must complement the behavior of setupLoopOutlineArgs(). - unsigned getIVArgIndex(const Function &F, - const ValueSet &Args) const override; - - /// Returns an integer identifying the index of the helper-function argument - /// in Args that specifies the ending iteration number. This return value - /// must complement the behavior of setupLoopOutlineArgs(). - unsigned getLimitArgIndex(const Function &F, - const ValueSet &Args) const override; - /// Process the TapirLoop before it is outlined -- just prior to the /// outlining occurs. This allows the VMap and related details to be /// customized prior to outlining related operations (e.g. cloning of @@ -199,8 +174,6 @@ class HipLoop : public LoopOutlineProcessor { void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override; - void remapData(ValueToValueMapTy &VMap) override final; - private: Value *emitWorkItemId(IRBuilder<> &Builder, int ItemIndex); Value *emitWorkGroupId(IRBuilder<> &Builder, int ItemIndex); @@ -220,8 +193,6 @@ class HipLoop : public LoopOutlineProcessor { FunctionCallee HipWorkGroupIdXFn, HipWorkGroupIdYFn, HipWorkGroupIdZFn; FunctionCallee HipBlockDimFn; - SmallVector OrderedInputs; - /// The GlobalValue's used in the loop that is being outlined. This includes /// functions, global variables, aliases and ifunc's. 
std::set UsedGlobalValues; diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h index 270b77794620c..4940f3d7d463a 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h @@ -47,7 +47,8 @@ Loop *StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, AssumptionCache *AC, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, bool ParallelEpilog, bool NeedNestedSync, - Loop **Remainderloop = nullptr); + Loop **Remainderloop = nullptr, + bool GPU = false); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h index 5b130c3e89d62..f148f20ff1820 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h @@ -24,7 +24,7 @@ class LoopStripMinePass : public PassInfoMixin { public: explicit LoopStripMinePass() {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM); }; } // end namespace llvm diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 35c4a8bd78229..945f422ef6a0e 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2260,6 +2260,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::KitKernel; case bitc::ATTR_KIND_KIT_DEVICE: return Attribute::KitDevice; + case bitc::ATTR_KIND_KIT_REDUCTION: + return Attribute::KitsuneReduction; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 05a103aae5ddd..c9451018b070b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -960,6 +960,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { 
return bitc::ATTR_KIND_KIT_KERNEL; case Attribute::KitDevice: return bitc::ATTR_KIND_KIT_DEVICE; + case Attribute::KitsuneReduction: + return bitc::ATTR_KIND_KIT_REDUCTION; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 57c38dcd5aa1e..c21e98f38e980 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -14,6 +14,7 @@ /// //===----------------------------------------------------------------------===// +#include "kitsune/Analysis/TapirTargetAnalysis.h" #include "kitsune/Passes/PipelineUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -1597,34 +1598,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // rather than on each loop in an inside-out manner, and so they are actually // function passes. - // Stripmine Tapir loops, if pass is enabled. - if (PTO.LoopStripmine && Level.getSpeedupLevel() > 1 && - !Level.isOptimizingForSize()) { - LoopPassManager LPM1, LPM2; - LPM1.addPass( - IndVarSimplifyPass(/*WidenIndVars=*/true, /*TapirLoopsOnly=*/true)); - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(std::move(LPM1), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); - OptimizePM.addPass(LoopStripMinePass()); - // Cleanup tasks after stripmining loops. - OptimizePM.addPass(TaskSimplifyPass()); - // Cleanup after stripmining loops. - LPM2.addPass(LoopSimplifyCFGPass()); - LPM2.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, - /*AllowSpeculation=*/true)); - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(std::move(LPM2), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); - // Don't run IndVarSimplify at this point, as it can actually inhibit - // vectorization in some cases. 
- OptimizePM.addPass(JumpThreadingPass()); - OptimizePM.addPass(CorrelatedValuePropagationPass()); - OptimizePM.addPass(InstCombinePass()); - } - invokeVectorizerStartEPCallbacks(OptimizePM, Level); LoopPassManager LPM; @@ -1760,8 +1733,41 @@ PassBuilder::buildTapirLoopLoweringPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + // Stripmine Tapir loops, if pass is enabled. + // This was moved from buildModuleOptimizationPipeline, I (George) expect it to be + // beneficial for all parallel backends. Needs performance evaluation to confirm. + if (PTO.LoopStripmine) { + // We need to do tapir target analysis to determine how to stripmine + MPM.addPass(RequireAnalysisPass()); + //MPM.addPass( + //createModuleToFunctionPassAdaptor(InvalidateAnalysisPass())); + + LoopPassManager LPM1, LPM2; + LPM1.addPass(IndVarSimplifyPass(true, true)); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM1), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + MPM.addPass(LoopStripMinePass()); + // Cleanup tasks after stripmining loops. + FPM.addPass(TaskSimplifyPass()); + // Cleanup after stripmining loops. + LPM2.addPass(LoopSimplifyCFGPass()); + LPM2.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM2), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + // Don't run IndVarSimplify at this point, as it can actually inhibit + // vectorization in some cases. + FPM.addPass(JumpThreadingPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(InstCombinePass()); + } + + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); // Outline Tapir loops as needed. 
MPM.addPass(LoopSpawningPass(Level)); if (VerifyTapirLowering) diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 8e62e9a2b97f6..063eccc8c83a8 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -192,6 +192,7 @@ MODULE_PASS("kit-kernel-properties", RecomputeKernelPropertiesPass()) MODULE_PASS("kit-prefetch", PrefetchingPass()) MODULE_PASS("kit-lower-intrinsics", LowerKitsuneIntrinsicsPass()) MODULE_PASS("kit-strip-addr-spaces", StripKitsuneAddrSpacesPass()) +MODULE_PASS("loop-stripmine", LoopStripMinePass()) #undef MODULE_PASS @@ -510,7 +511,6 @@ FUNCTION_PASS("loop-fusion", LoopFusePass()) FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) -FUNCTION_PASS("loop-stripmine", LoopStripMinePass()) FUNCTION_PASS("loop-versioning", LoopVersioningPass()) FUNCTION_PASS("lower-atomic", LowerAtomicPass()) FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d32b829e2ad79..79e530c6362b1 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -284,6 +284,10 @@ bool DeadArgumentEliminationPass::removeDeadArgumentsFromCallers(Function &F) { if (F.hasFnAttribute(Attribute::Naked)) return false; + // Don't operate on optnone (we need this for kitsune reductions) + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + if (F.use_empty()) return false; @@ -730,6 +734,10 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { if (FrozenFunctions.count(F)) return false; + // Don't operate on optnone (we need this for kitsune reductions) + if (F->hasFnAttribute(Attribute::OptimizeNone)) + return false; + // Start by computing a new prototype for the function, which is 
the same as // the old function, but has fewer arguments and a different return type. FunctionType *FTy = F->getFunctionType(); diff --git a/llvm/lib/Transforms/Tapir/CudaABI.cpp b/llvm/lib/Transforms/Tapir/CudaABI.cpp index 6d72fa29a937f..ec731bcc7a9d3 100644 --- a/llvm/lib/Transforms/Tapir/CudaABI.cpp +++ b/llvm/lib/Transforms/Tapir/CudaABI.cpp @@ -198,78 +198,6 @@ CudaLoop::~CudaLoop() { << KernelName << "'.\n"); } -void CudaLoop::setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) { - LLVM_DEBUG(dbgs() << "debug[cuabi]: setting up loop outline arguments...\n"); - - // Add the loop control inputs -- the first parameter defines the extent of - // the index space (the number of threads to launch). - { - Argument *EndArg = cast(LCArgs[1]); - EndArg->setName("runSize"); - HelperArgs.insert(EndArg); - - Value *InputVal = LCInputs[1]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The second parameter defines the start of the index space. - { - Argument *StartArg = cast(LCArgs[0]); - StartArg->setName("runStart"); - HelperArgs.insert(StartArg); - - Value *InputVal = LCInputs[0]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The third parameter defines the grain size, if it is not constant. - if (!isa(LCInputs[2])) { - Argument *GrainsizeArg = cast(LCArgs[2]); - GrainsizeArg->setName("grainSize"); - HelperArgs.insert(GrainsizeArg); - - Value *InputVal = LCInputs[2]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // Add the loop-centric kernel parameters (i.e., variables/arrays - // used in the loop body). 
- LLVM_DEBUG(dbgs() << " - adding loop-centric kernel arguments...\n"); - for (Value *V : TLInputsFixed) { - HelperArgs.insert(V); - HelperInputs.push_back(V); - LLVM_DEBUG(dbgs() << " - arg: " << V->getName() << "\n"); - } - - LLVM_DEBUG(dbgs() << " - adding helper kernel arguments...\n"); - for (Value *V : HelperInputs) { - OrderedInputs.push_back(V); - LLVM_DEBUG(dbgs() << " - helper arg: " << V->getName() << "\n"); - } - - LLVM_DEBUG(dbgs() << " - done.\n"); -} - -unsigned CudaLoop::getIVArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the primary induction variable is the second input. - return 1; -} - -unsigned CudaLoop::getLimitArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the loop limit is the first input. - return 0; -} - void CudaLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { LLVM_DEBUG(dbgs() << "debug[cuabi]: -preprocessing loop for kernel '" << KernelName << "'.\n"); @@ -378,23 +306,19 @@ void CudaLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the loop limit. Get // end and grainsize arguments - Argument *End; - Value *Grainsize; - { - // TODO: We only support a grain size of 1 right now. Not clear if this - // could be a future optimization but strip mining on our current tests only - // results in degraded performance... - auto OutlineArgsIter = KernelF->arg_begin(); - // End argument is the first LC arg. - End = &*OutlineArgsIter++; - - // Get the grainsize value, which is either constant or the third LC arg. - // if (unsigned ConstGrainsize = TLI.getGrainsize()) - // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - // else - Grainsize = - ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); - } + + // End argument is always the second argument in the kernel function. 
+ Argument *End = KernelF->getArg(1); + + // Get the grainsize value, which is either constant or the third LC arg. + // TODO: We only support a grain size of 1 right now. Not clear if this + // could be a future optimization but strip mining on our current tests only + // results in degraded performance. + // if (unsigned ConstGrainsize = TLI.getGrainsize()) + // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); + // else + Value *Grainsize = + ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); IRBuilder<> B(Entry->getTerminator()); @@ -433,14 +357,6 @@ void CudaLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, ClonedCond->setOperand(TripCountIdx, ThreadEnd); } -void CudaLoop::remapData(ValueToValueMapTy &VMap) { - for (auto &V : OrderedInputs) { - if (auto MappedV = VMap[V]) { - V = MappedV; - } - } -} - void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "cudaloop: processing outlined loop call...\n" @@ -468,12 +384,13 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, TapirLoopHints Hints(TL.getLoop()); Value *TPB = ConstantInt::get(Int32Ty, Hints.getThreadsPerBlock()); - BasicBlock *RCBB = TOI.ReplCall->getParent(); - BasicBlock *NewBB = RCBB->splitBasicBlock(TOI.ReplCall); + CallBase* CallOutlined = cast(TOI.ReplCall); + BasicBlock *RCBB = CallOutlined->getParent(); + BasicBlock *NewBB = RCBB->splitBasicBlock(CallOutlined); IRBuilder<> Builder(&NewBB->front()); // Deal with type mismatches for the trip count. 
- Value *TripCount = OrderedInputs[0]; + Value *TripCount = CallOutlined->getArgOperand(1); if (TripCount->getType() != Int64Ty) TripCount = Builder.CreateSExtOrBitCast(TripCount, Int64Ty, "cast.tc"); @@ -485,7 +402,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Builder.CreateIntrinsic(PtrTy, Intrinsic::kit_thread_stream, {CTT}); std::vector Args = {CTT, EmbFB, KName, TripCount, TPB, KProps, CudaStream}; - for (Value *Inp : OrderedInputs) + for (Value *Inp : CallOutlined->args()) Args.push_back(Inp); // TODO: We should probably have the launch and sync kitsune intrinsics take @@ -506,7 +423,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // the global again). copyNonConstGlobalsDToH(UsedGlobalValues, TTID::Cuda, M, Builder); - TOI.ReplCall->eraseFromParent(); + CallOutlined->eraseFromParent(); LLVM_DEBUG(dbgs() << "*** finished processing outlined call.\n"); } diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index d1909db60de42..c6502fbeb8938 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -12,11 +12,7 @@ // //===----------------------------------------------------------------------===// -#pragma warning "GPUABI has been deprecated" -#if 0 - #include "llvm/Transforms/Tapir/GPUABI.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" @@ -25,24 +21,37 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" -#include "llvm/Transforms/Vectorize.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IRReader/IRReader.h" #include "llvm/MC/TargetRegistry.h" -#include #include using namespace llvm; #define DEBUG_TYPE "gpuabi" +// JIT compiler kernel at containing function entry, makes timing easier at 
the +// cost of less laziness +static cl::opt + JIT("jit-callsite", cl::init(false), cl::NotHidden, + cl::desc("Wait until parallel loop is called to jit kernel. " + "(default=false)")); + Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { - Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); + IRBuilder<> BH(GrainsizeCall); + auto *M = GrainsizeCall->getModule(); + Type *LLVMInt64Ty = Type::getInt64Ty(M->getContext()); + Type *LLVMInt32Ty = Type::getInt32Ty(M->getContext()); + // we have to cast + Value *GSO = BH.CreateIntCast(GrainsizeCall->getOperand(0), LLVMInt64Ty, false); + Value *GS = BH.CreateCall(M->getOrInsertFunction("gpuGridSize", LLVMInt32Ty, LLVMInt64Ty), {GSO}); + Value *GSN = BH.CreateIntCast(GS, GrainsizeCall->getType(), false); + //FunctionCallee GGS = M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty); // Replace uses of grainsize intrinsic call with this grainsize value. - GrainsizeCall->replaceAllUsesWith(Grainsize); - return Grainsize; + //GrainsizeCall->setCalledFunction(GGS); + GrainsizeCall->replaceAllUsesWith(GSN); + return GSN; } void GPUABI::lowerSync(SyncInst &SI) { @@ -54,8 +63,9 @@ void GPUABI::postProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm:: void GPUABI::preProcessRootSpawner(llvm::Function&, BasicBlock *TFEntry){} void GPUABI::postProcessRootSpawner(llvm::Function&, BasicBlock *TFEntry){} -void GPUABI::preProcessFunction(Function &F, TaskInfo &TI, +bool GPUABI::preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) { + return false; } void GPUABI::postProcessFunction(Function &F, bool OutliningTapirLoops) { @@ -68,8 +78,7 @@ void GPUABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { } LoopOutlineProcessor * -GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL, - OptimizationLevel OptLevel = OptimizationLevel=O2) { +GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL) { if(!LOP) return new LLVMLoop(M); return LOP; @@ -79,7 +88,7 @@
GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL, unsigned LLVMLoop::NextKernelID = 0; LLVMLoop::LLVMLoop(Module &M) - : LoopOutlineProcessor(M, LLVMM), LLVMM("kernelModule", M.getContext()) { + : LoopOutlineProcessor(M, TTOpts), LLVMM("kernelModule", M.getContext()) { ValueToValueMapTy VMap; // LLVMMptr = CloneModule(M, vmap, [](const GlobalValue* gv) { return false; }); // And named metadata.... @@ -93,19 +102,16 @@ LLVMLoop::LLVMLoop(Module &M) // Setup an LLVM triple. Triple LLVMTriple("spir64-unknown-unknown"); - LLVMM.setTargetTriple(LLVMTriple.str()); + LLVMM.setTargetTriple(LLVMTriple); // Insert runtime-function declarations in LLVM host modules. Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); - Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext()); GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty); - Function* getid = LLVMM.getFunction("gtid"); + Type *VoidTy = Type::getVoidTy(M.getContext()); - Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); + Type *VoidPtrTy = PointerType::getUnqual(M.getContext()); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - Type *Int8Ty = Type::getInt8Ty(M.getContext()); - Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty); @@ -177,17 +183,12 @@ unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { - LLVMContext &Ctx = M.getContext(); - Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); - //Type *Int64Ty = Type::getInt64Ty(Ctx); - //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); - BasicBlock *Entry = cast(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast(VMap[L->getHeader()]); + 
BasicBlock *Latch = cast(VMap[L->getLoopLatch()]); BasicBlock *Exit = cast(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); @@ -201,10 +202,18 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Get the thread ID for this invocation of Helper. IRBuilder<> B(Entry->getTerminator()); Value *ThreadIdx = B.CreateCall(GetThreadIdx); - //Value *BlockIdx = B.CreateCall(GetBlockIdx, ConstantInt::get(Int32Ty, 0)); - //Value *BlockDim = B.CreateCall(GetBlockDim, ConstantInt::get(Int32Ty, 0)); Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); + // Loop should be handled in stripmining, here we just remove the loop by setting it to a jump + BranchInst *BI = cast(Latch->getTerminator()); + if(BI->getSuccessor(0) == Exit) + BI->setCondition(ConstantInt::get(BI->getCondition()->getType(), true)); + else + BI->setCondition(ConstantInt::get(BI->getCondition()->getType(), false)); + + //AV.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(LLVMCtx), + // 1))); + Function *Helper = Out.Outline; Helper->setName("kitsune_kernel"); @@ -218,25 +227,12 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // the loop limit with stride 1. The End argument encodes the loop limit. // Get end and grainsize arguments Argument *End; - Value *Grainsize; { - auto OutlineArgsIter = Helper->arg_begin(); + auto *OutlineArgsIter = Helper->arg_begin(); // End argument is the first LC arg. End = &*OutlineArgsIter; - - // Get the grainsize value, which is either constant or the third LC arg. - // ReplaceInstWithInst(gep, GetElementPtrInst::Create( - if (unsigned ConstGrainsize = TL.getGrainsize()) - Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - else - // Grainsize argument is the third LC arg. 
- Grainsize = &*++(++OutlineArgsIter); } - ThreadID = B.CreateMul(ThreadID, Grainsize); - Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); - Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); - Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); - Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); + Value *Cond = B.CreateICmpUGE(ThreadID, End); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); @@ -246,10 +242,10 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Update cloned loop condition to use the thread-end value. unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast(VMap[TL.getCondition()]); - if (ClonedCond->getOperand(0) != ThreadEnd) + if (ClonedCond->getOperand(0) != End) ++TripCountIdx; - ClonedCond->setOperand(TripCountIdx, ThreadEnd); - assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && + ClonedCond->setOperand(TripCountIdx, End); + assert(ClonedCond->getOperand(TripCountIdx) == End && "End argument not used in condition"); } @@ -258,12 +254,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); - Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); + PointerType *VoidPtrTy = PointerType::getUnqual(Ctx); - //Task *T = TL.getTask(); - //Instruction *ReplCall = cast(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; @@ -276,7 +269,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // Compile the kernel //LLVMM.getFunctionList().remove(TOI.Outline); //TOI.Outline->eraseFromParent(); - LLVMContext &LLVMCtx = LLVMM.getContext(); ValueToValueMapTy VMap; // We recursively add definitions and declarations to the 
device module @@ -305,7 +297,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, SmallVector Returns; CloneFunctionInto(deviceF, f, VMap, CloneFunctionChangeType::DifferentModule, Returns); // GPU calls are slow as balls, try to force inlining - deviceF->addFnAttr(Attribute::AlwaysInline); + if(!deviceF->hasFnAttribute(Attribute::NoInline)) + deviceF->addFnAttr(Attribute::AlwaysInline); todo.push_back(deviceF); } } @@ -354,12 +347,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, PassManager->add(createReassociatePass()); PassManager->add(createGVNPass()); PassManager->add(createCFGSimplificationPass()); - PassManager->add(createLoopVectorizePass()); - PassManager->add(createSLPVectorizerPass()); //PassManager->add(createBreakCriticalEdgesPass()); //PassManager->add(createConstantPropagationPass()); PassManager->add(createDeadCodeEliminationPass()); - PassManager->add(createDeadStoreEliminationPass()); //PassManager->add(createInstructionCombiningPass()); PassManager->add(createCFGSimplificationPass()); PassManager->add(createDeadCodeEliminationPass()); @@ -389,15 +379,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - //Value* TripCount = isSRetInput(TOI.InputSet[0]) ? 
TOI.InputSet[1] : TOI.InputSet[0]; - //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, - // TOI.InputSet)); - //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, - // TOI.InputSet)); - - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); - Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); - Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); Constant *kernelSize = ConstantInt::get(Int64Ty, LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); @@ -418,23 +399,12 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, B.CreateStore(VoidVPtr, argPtr); } - Value *Grainsize = TL.getGrainsize() ? - ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : - OrderedInputs[2]; - - //Type *Int64Ty = Type::getInt64Ty(LLVMM.getContext()); - Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); - Value *RunRem = B.CreateURem(TripCount, Grainsize); - Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); - Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); - Value *RunSize = B.CreateZExt(B.CreateAdd(RunSizeQ, IsRemAdd), Int64Ty); - - Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); - Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); - Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, RunSize }); + Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); + Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); + Value *TripCount64 = B.CreateIntCast(TripCount, Int64Ty, true); + Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount64 }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } -#endif diff --git a/llvm/lib/Transforms/Tapir/HipABI.cpp b/llvm/lib/Transforms/Tapir/HipABI.cpp index d2b27ce7dcce0..38a9cfdec855d 
100644 --- a/llvm/lib/Transforms/Tapir/HipABI.cpp +++ b/llvm/lib/Transforms/Tapir/HipABI.cpp @@ -221,75 +221,6 @@ HipLoop::HipLoop(Module &M, Module &KM, StringRef Name, HipLoop::~HipLoop() { /* no-op */ } -// TODO: Can we also transform the arguments into a different address space here -// and avoid our use of 'mutate' elsewhere in the code? -void HipLoop::setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) { - LLVM_DEBUG(dbgs() << "\n\n" - << "hipabi: SETTING UP LOOP OUTLINE ARGUMENTS FOR '" - << F.getName() << "()'.\n"); - - // Add the loop control inputs -- the first parameter defines the extent of - // the index space. - { - Argument *EndArg = cast(LCArgs[1]); - EndArg->setName(".kern.input_size"); // nice for debugging... - HelperArgs.insert(EndArg); - - Value *InputVal = LCInputs[1]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The second parameter defines the start of the index space. - { - Argument *StartArg = cast(LCArgs[0]); - StartArg->setName(".kern.start_idx"); - HelperArgs.insert(StartArg); - - Value *InputVal = LCInputs[0]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The third parameter defines the grain size, if it is not constant. - if (!isa(LCInputs[2])) { - Argument *GrainsizeArg = cast(LCArgs[2]); - GrainsizeArg->setName(".kern.grain_size"); - HelperArgs.insert(GrainsizeArg); - - Value *InputVal = LCInputs[2]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // Add the loop-centric kernel parameters (i.e., variables/arrays - // used in the loop body). 
- for (Value *V : TLInputsFixed) { - HelperArgs.insert(V); - HelperInputs.push_back(V); - } - - for (Value *V : HelperInputs) { - OrderedInputs.push_back(V); - } -} - -unsigned HipLoop::getIVArgIndex(const Function &F, const ValueSet &Args) const { - // The argument for the primary induction variable is the second input. - return 1; -} - -unsigned HipLoop::getLimitArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the loop limit is the first input. - return 0; -} - void HipLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { bool VerboseMode = getOptions().getTapirVerbose(); if (VerboseMode) { @@ -429,22 +360,19 @@ void HipLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the limit. // Get end and grain size arguments - Argument *End; - Value *Grainsize; - { - // TODO: We really only want a grain size of 1 for now... - auto OutlineArgsIter = KernelF->arg_begin(); - // End argument is the first LC arg. - End = &*OutlineArgsIter++; - - // Get the grain size value, which is either constant or the third LC - // arg. - // if (unsigned ConstGrainsize = TLI.getGrainsize()) - // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - // else - Grainsize = - ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); - } + + // End argument is always the second argument in the kernel function. + Argument *End = KernelF->getArg(1); + + // Get the grainsize value, which is either constant or the third LC arg. + // TODO: We only support a grain size of 1 right now. Not clear if this + // could be a future optimization but strip mining on our current tests only + // results in degraded performance. 
+ // if (unsigned ConstGrainsize = TLI.getGrainsize()) + // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); + // else + Value *Grainsize = + ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); IRBuilder<> Builder(Entry->getTerminator()); @@ -500,12 +428,6 @@ void HipLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, ClonedCond->setOperand(TripCountIdx, ThreadEnd); } -void HipLoop::remapData(ValueToValueMapTy &VMap) { - for (auto &V : OrderedInputs) - if (Value *MappedV = VMap[V]) - V = MappedV; -} - void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "hiploop: processing outlined loop call...\n" @@ -541,12 +463,13 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, else TPB = ConstantInt::get(Int32Ty, 0); - BasicBlock *RCBB = TOI.ReplCall->getParent(); - BasicBlock *NewBB = RCBB->splitBasicBlock(TOI.ReplCall); + CallBase *CallOutlined = cast(TOI.ReplCall); + BasicBlock *RCBB = CallOutlined->getParent(); + BasicBlock *NewBB = RCBB->splitBasicBlock(CallOutlined); IRBuilder<> Builder(&NewBB->front()); // Deal with type mismatches for the trip count. - Value *TripCount = OrderedInputs[0]; + Value *TripCount = CallOutlined->getArgOperand(1); if (TripCount->getType() != Int64Ty) TripCount = Builder.CreateSExtOrBitCast(TripCount, Int64Ty, "cast.tc"); @@ -558,7 +481,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Builder.CreateIntrinsic(PtrTy, Intrinsic::kit_thread_stream, {CTT}); std::vector Args = {CTT, EmbFB, KName, TripCount, TPB, KProps, HipStream}; - for (Value *Inp : OrderedInputs) + for (Value *Inp : CallOutlined->args()) Args.push_back(Inp); // TODO: We should probably have the launch and sync kitsune intrinsics take @@ -579,7 +502,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // the global again). 
copyNonConstGlobalsDToH(UsedGlobalValues, TTID::Hip, M, Builder); - TOI.ReplCall->eraseFromParent(); + CallOutlined->eraseFromParent(); LLVM_DEBUG(dbgs() << "*** finished processing outlined call.\n"); } diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index ca651c64c7ece..b1db7abb46776 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; @@ -766,7 +767,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, const TargetTransformInfo &TTI, AssumptionCache *AC, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, bool ParallelEpilog, - bool NeedNestedSync, Loop **RemainderLoop) { + bool NeedNestedSync, Loop **RemainderLoop, + bool GPU) { Task *T = getTapirLoopForStripMining(L, *TI, ORE); if (!T) return nullptr; @@ -815,6 +817,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, BasicBlock *Latch = L->getLoopLatch(); BasicBlock *Header = L->getHeader(); BasicBlock *TaskEntry = T->getEntry(); + assert(isa(Header->getTerminator()) && "Header not terminated by a detach."); DetachInst *DI = cast(Header->getTerminator()); @@ -825,6 +828,10 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + + Function *F = Header->getParent(); + LLVM_DEBUG(dbgs() << "Function before strip mining\n" << *F); + // We will use the increment of the primary induction variable to derive // wrapping flags. 
Instruction *PrimaryInc = @@ -878,7 +885,12 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, return nullptr; } - LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << Count << "\n"); + if(GPU){ + LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << "gpu grainsize call" << "\n"); + } + else { + LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << Count << "\n"); + } using namespace ore; ORE->emit([&]() { return OptimizationRemark(LSM_NAME, "Stripmined", @@ -894,6 +906,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // ... // Latch // LatchExit + Module *M = F->getParent(); // Insert the epilog remainder. BasicBlock *NewPreheader; @@ -937,41 +950,74 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // *EpilogPreheader // LatchExit - IRBuilder<> B(PreheaderBR); + IRBuilder<> B(PreheaderBR); Value *ModVal; - // Calculate ModVal = (BECount + 1) % Count. - // Note that TripCount is BECount + 1. - if (isPowerOf2_32(Count)) { - // When Count is power of 2 we don't BECount for epilog case. However we'll - // need it for a branch around stripmined loop for prolog case. - ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - // 1. There are no iterations to be run in the prolog/epilog loop. - // OR - // 2. The addition computing TripCount overflowed. - // - // If (2) is true, we know that TripCount really is (1 << BEWidth) and so - // the number of iterations that remain to be run in the original loop is a - // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we - // explicitly check this above). - if (TL.isInclusiveRange()) - ModVal = B.CreateAdd(ModVal, ConstantInt::get(ModVal->getType(), 1)); - } else { - // As (BECount + 1) can potentially unsigned overflow we count - // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. 
- Value *ModValTmp = B.CreateURem(BECount, - ConstantInt::get(BECount->getType(), - Count)); - Value *ModValAdd = B.CreateAdd(ModValTmp, - ConstantInt::get(ModValTmp->getType(), 1)); - // At that point (BECount % Count) + 1 could be equal to Count. - // To handle this case we need to take mod by Count one more time. - ModVal = B.CreateURem(ModValAdd, - ConstantInt::get(BECount->getType(), Count), - "xtraiter"); + Value *StepSize; + Value *BranchVal; + // Int the gpu case we don't need an epilogue + // If we start with forall(i=0..n) + // GPU stripmine converts to + // forall(i=0; igetEntryBlock().getFirstNonPHI()); + + Instruction* bloc; + if(Instruction* I = dyn_cast(TripCount)){ + bloc = I->getNextNode(); + } else { + bloc = F->getEntryBlock().getTerminator(); + } + + IRBuilder<> B2(bloc); + StepSize = B2.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::tapir_loop_grainsize, + { TripCount->getType() }), { TripCount }); + + + BranchVal = B.CreateICmpULE(ModVal, ConstantInt::get(ModVal->getType(), 0)); + } + else { + // Calculate ModVal = (BECount + 1) % Count. + // Note that TripCount is BECount + 1. + if (isPowerOf2_32(Count)) { + // When Count is power of 2 we don't BECount for epilog case. However we'll + // need it for a branch around stripmined loop for prolog case. + ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); + // 1. There are no iterations to be run in the prolog/epilog loop. + // OR + // 2. The addition computing TripCount overflowed. + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so + // the number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). 
+ if (TL.isInclusiveRange()) + ModVal = B.CreateAdd(ModVal, ConstantInt::get(ModVal->getType(), 1)); + } else { + // As (BECount + 1) can potentially unsigned overflow we count + // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. + Value *ModValTmp = B.CreateURem(BECount, + ConstantInt::get(BECount->getType(), + Count)); + Value *ModValAdd = B.CreateAdd(ModValTmp, + ConstantInt::get(ModValTmp->getType(), 1)); + // At that point (BECount % Count) + 1 could be equal to Count. + // To handle this case we need to take mod by Count one more time. + ModVal = B.CreateURem(ModValAdd, + ConstantInt::get(BECount->getType(), Count), + "xtraiter"); + } + BranchVal = B.CreateICmpSLT( + BECount, ConstantInt::get(BECount->getType(), + TL.isInclusiveRange() ? Count : Count - 1)); } - Value *BranchVal = B.CreateICmpSLT( - BECount, ConstantInt::get(BECount->getType(), - TL.isInclusiveRange() ? Count : Count - 1)); BasicBlock *RemainderLoopBB = NewExit; BasicBlock *StripminedLoopBB = NewPreheader; // Branch to either remainder (extra iterations) loop or stripmined loop. @@ -980,7 +1026,6 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if (DT) DT->changeImmediateDominator(NewExit, Preheader); - Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog/epilog code LoopBlocksDFS LoopBlocks(L); @@ -1049,21 +1094,22 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // TODO: For stripmine factor 2 remainder loop will have 1 iterations. // Do not create 1 iteration loop. // bool CreateRemainderLoop = (Count != 2); - bool CreateRemainderLoop = true; + bool CreateRemainderLoop = !GPU; // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. 
BasicBlock *InsertBot = LatchExit; BasicBlock *InsertTop = EpilogPreheader; - *RemainderLoop = - cloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, - InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, - ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI, Count); + if(CreateRemainderLoop){ + *RemainderLoop = + cloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, + InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, + ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI, Count); - // Insert the cloned blocks into the function. - F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), - F->end()); + // Insert the cloned blocks into the function. + F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), + F->end()); // Loop structure should be the following: // Epilog @@ -1082,51 +1128,51 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Rewrite the cloned instruction operands to use the values created when the // clone is created. - for (BasicBlock *BB : NewBlocks) - for (Instruction &I : *BB) - RemapInstruction(&I, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + for (BasicBlock *BB : NewBlocks) + for (Instruction &I : *BB) + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Serialize the cloned loop body to render the inner loop serial. - { - // Translate all the analysis for the new cloned task. - SmallVector ClonedReattaches; - for (Instruction *I : Reattaches) - ClonedReattaches.push_back(cast(VMap[I])); - SmallPtrSet ClonedEHBlockPreds; - for (BasicBlock *B : EHBlockPreds) - ClonedEHBlockPreds.insert(cast(VMap[B])); - SmallVector ClonedEHBlocks; - for (BasicBlock *B : EHBlocksToClone) - ClonedEHBlocks.push_back(cast(VMap[B])); - // Landing pads and detached-rethrow instructions may or may not have been - // cloned. 
- SmallPtrSet ClonedInlinedLPads; - for (LandingPadInst *LPad : InlinedLPads) { - if (VMap[LPad]) - ClonedInlinedLPads.insert(cast(VMap[LPad])); - else - ClonedInlinedLPads.insert(LPad); - } - SmallVector ClonedDetachedRethrows; - for (Instruction *DR : DetachedRethrows) { - if (VMap[DR]) - ClonedDetachedRethrows.push_back(cast(VMap[DR])); - else - ClonedDetachedRethrows.push_back(DR); + // Serialize the cloned loop body to render the inner loop serial. + { + // Translate all the analysis for the new cloned task. + SmallVector ClonedReattaches; + for (Instruction *I : Reattaches) + ClonedReattaches.push_back(cast(VMap[I])); + SmallPtrSet ClonedEHBlockPreds; + for (BasicBlock *B : EHBlockPreds) + ClonedEHBlockPreds.insert(cast(VMap[B])); + SmallVector ClonedEHBlocks; + for (BasicBlock *B : EHBlocksToClone) + ClonedEHBlocks.push_back(cast(VMap[B])); + // Landing pads and detached-rethrow instructions may or may not have been + // cloned. + SmallPtrSet ClonedInlinedLPads; + for (LandingPadInst *LPad : InlinedLPads) { + if (VMap[LPad]) + ClonedInlinedLPads.insert(cast(VMap[LPad])); + else + ClonedInlinedLPads.insert(LPad); + } + SmallVector ClonedDetachedRethrows; + for (Instruction *DR : DetachedRethrows) { + if (VMap[DR]) + ClonedDetachedRethrows.push_back(cast(VMap[DR])); + else + ClonedDetachedRethrows.push_back(DR); + } + DetachInst *ClonedDI = cast(VMap[DI]); + // Serialize the new task. + SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, + ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, + &ClonedInlinedLPads, &ClonedDetachedRethrows, + NeedToInsertTaskFrame, DT, nullptr, LI); } - DetachInst *ClonedDI = cast(VMap[DI]); - // Serialize the new task. - SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, - ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, - &ClonedInlinedLPads, &ClonedDetachedRethrows, - NeedToInsertTaskFrame, DT, nullptr, LI); } // Detach the stripmined loop. 
Value *SyncReg = DI->getSyncRegion(), *NewSyncReg; BasicBlock *EpilogPred, *LoopDetEntry, *LoopReattach; - Module *M = F->getParent(); if (ParallelEpilog) { ORE->emit([&]() { return OptimizationRemark(LSM_NAME, "ParallelEpil", @@ -1339,48 +1385,66 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // // TODO: Generalize to handle non-power-of-2 counts. assert(isPowerOf2_32(Count) && "Count is not a power of 2."); - Value *TestVal = B2.CreateUDiv(TripCount, - ConstantInt::get(TripCount->getType(), Count), - "stripiter"); - // Value *TestVal = B2.CreateSub(TripCount, ModVal, "stripiter", true, true); - - // Value *TestCmp = B2.CreateICmpUGT(TestVal, - // ConstantInt::get(TestVal->getType(), 0), - // TestVal->getName() + ".ncmp"); - // ReplaceInstWithInst(NewPreheader->getTerminator(), - // BranchInst::Create(Header, LatchExit, TestCmp)); - // DT->changeImmediateDominator(LatchExit, - // DT->findNearestCommonDominator(LatchExit, - // NewPreheader)); - - // Add new counter for new outer loop. - // - // We introduce a new primary induction variable, NewIdx, into the outer loop, - // which counts up to the outer-loop trip count from 0, stepping by 1. In - // contrast to counting down from the outer-loop trip count, this new variable - // ensures that future loop passes, including LoopSpawning, can process this - // outer loop when we're done. 
- PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter"); - NewIdx->insertBefore(NewHeader->getFirstNonPHIIt()); - B2.SetInsertPoint(NewLatch->getTerminator()); - // Instruction *IdxSub = cast( - // B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), - // NewIdx->getName() + ".nsub")); - // IdxSub->copyIRFlags(PrimaryInc); - Instruction *IdxAdd = cast( - B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), - NewIdx->getName() + ".nadd")); - IdxAdd->copyIRFlags(PrimaryInc); - - // NewIdx->addIncoming(TestVal, NewPreheader); - // NewIdx->addIncoming(IdxSub, NewLatch); - // Value *IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); - NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); - NewIdx->addIncoming(IdxAdd, NewLatch); - Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, - NewIdx->getName() + ".ncmp"); - ReplaceInstWithInst(NewLatch->getTerminator(), - BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + PHINode *NewIdx; + if(GPU){ + Value *TestVal = StepSize; + NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + NewHeader->getFirstNonPHIIt()); + B2.SetInsertPoint(NewLatch->getTerminator()); + Instruction *IdxAdd = cast( + B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nadd")); + IdxAdd->copyIRFlags(PrimaryInc); + NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); + NewIdx->addIncoming(IdxAdd, NewLatch); + Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, + NewIdx->getName() + ".ncmp"); + ReplaceInstWithInst(NewLatch->getTerminator(), + BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + } else { + Value *TestVal = B2.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); + // Value *TestVal = B2.CreateSub(TripCount, ModVal, "stripiter", true, true); + + // Value *TestCmp = B2.CreateICmpUGT(TestVal, + // ConstantInt::get(TestVal->getType(), 0), + // TestVal->getName() + ".ncmp"); + // 
ReplaceInstWithInst(NewPreheader->getTerminator(), + // BranchInst::Create(Header, LatchExit, TestCmp)); + // DT->changeImmediateDominator(LatchExit, + // DT->findNearestCommonDominator(LatchExit, + // NewPreheader)); + + // Add new counter for new outer loop. + // + // We introduce a new primary induction variable, NewIdx, into the outer loop, + // which counts up to the outer-loop trip count from 0, stepping by 1. In + // contrast to counting down from the outer-loop trip count, this new variable + // ensures that future loop passes, including LoopSpawning, can process this + // outer loop when we're done. + NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + NewHeader->getFirstNonPHIIt()); + B2.SetInsertPoint(NewLatch->getTerminator()); + // Instruction *IdxSub = cast( + // B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + // NewIdx->getName() + ".nsub")); + // IdxSub->copyIRFlags(PrimaryInc); + Instruction *IdxAdd = cast( + B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nadd")); + IdxAdd->copyIRFlags(PrimaryInc); + + // NewIdx->addIncoming(TestVal, NewPreheader); + // NewIdx->addIncoming(IdxSub, NewLatch); + // Value *IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); + NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); + NewIdx->addIncoming(IdxAdd, NewLatch); + Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, + NewIdx->getName() + ".ncmp"); + ReplaceInstWithInst(NewLatch->getTerminator(), + BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + } DT->changeImmediateDominator(NewLatch, NewHeader); // The block structure of the stripmined loop should now look like so: // @@ -1460,75 +1524,112 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Update all of the old PHI nodes B2.SetInsertPoint(NewEntry->getTerminator()); - Instruction *CountVal = cast( - B2.CreateMul(ConstantInt::get(NewIdx->getType(), Count), - NewIdx)); - 
CountVal->copyIRFlags(PrimaryInduction); - for (auto &InductionEntry : *TL.getInductionVars()) { - PHINode *OrigPhi = InductionEntry.first; - const InductionDescriptor &II = InductionEntry.second; - if (II.getStep()->isZero()) - // Nothing to do for this Phi - continue; - // Get the new step value for this Phi. - Value *PhiCount = !II.getStep()->getType()->isIntegerTy() - ? B2.CreateCast(Instruction::SIToFP, CountVal, - II.getStep()->getType()) - : B2.CreateSExtOrTrunc(CountVal, II.getStep()->getType()); - Value *NewStart = emitTransformedIndex(B2, PhiCount, SE, DL, II); - - // Get the old increment instruction for this Phi - int Idx = OrigPhi->getBasicBlockIndex(NewEntry); - OrigPhi->setIncomingValue(Idx, NewStart); + if(GPU){ + } + else{ + Instruction *CountVal = cast( + B2.CreateMul(ConstantInt::get(NewIdx->getType(), Count), + NewIdx)); + CountVal->copyIRFlags(PrimaryInduction); + for (auto &InductionEntry : *TL.getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + const InductionDescriptor &II = InductionEntry.second; + if (II.getStep()->isZero()) + // Nothing to do for this Phi + continue; + // Get the new step value for this Phi. + Value *PhiCount = !II.getStep()->getType()->isIntegerTy() + ? B2.CreateCast(Instruction::SIToFP, CountVal, + II.getStep()->getType()) + : B2.CreateSExtOrTrunc(CountVal, II.getStep()->getType()); + Value *NewStart = emitTransformedIndex(B2, PhiCount, SE, DL, II); + + // Get the old increment instruction for this Phi + int Idx = OrigPhi->getBasicBlockIndex(NewEntry); + OrigPhi->setIncomingValue(Idx, NewStart); + } } - // Add new induction variable for inner loop. 
- PHINode *InnerIdx = - PHINode::Create(PrimaryInduction->getType(), 2, "inneriter"); - InnerIdx->insertBefore(Header->getFirstNonPHIIt()); - Value *InnerTestVal = ConstantInt::get(PrimaryInduction->getType(), Count); - B2.SetInsertPoint(LatchBR); - Instruction *InnerSub = cast( - B2.CreateSub(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), - InnerIdx->getName() + ".nsub")); - InnerSub->copyIRFlags(PrimaryInc); - // Instruction *InnerAdd = cast( - // B2.CreateAdd(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), - // InnerIdx->getName() + ".nadd")); - // InnerAdd->copyIRFlags(PrimaryInc); - Value *InnerCmp; - if (LatchBR->getSuccessor(0) == Header) - InnerCmp = B2.CreateIsNotNull(InnerSub, InnerIdx->getName() + ".ncmp"); - else - InnerCmp = B2.CreateIsNull(InnerSub, InnerIdx->getName() + ".ncmp"); - InnerIdx->addIncoming(InnerTestVal, NewEntry); - InnerIdx->addIncoming(InnerSub, Latch); - // if (LatchBR->getSuccessor(0) == Header) - // InnerCmp = B2.CreateICmpNE(InnerAdd, InnerTestVal, - // InnerIdx->getName() + ".ncmp"); - // else - // InnerCmp = B2.CreateICmpEQ(InnerAdd, InnerTestVal, - // InnerIdx->getName() + ".ncmp"); - // InnerIdx->addIncoming(ConstantInt::get(InnerIdx->getType(), 0), NewEntry); - // InnerIdx->addIncoming(InnerAdd, Latch); - LatchBR->setCondition(InnerCmp); + if(GPU){ + // GPU mode: inner loop strides by grainsize. + PHINode *InnerIdx = + PHINode::Create(PrimaryInduction->getType(), 2, "inneriter", + Header->getFirstNonPHIIt()); + // Initialize inner index to zero. + //Value *Zero = ConstantInt::get(PrimaryInduction->getType(), 0); + B2.SetInsertPoint(LatchBR->getParent()->getFirstNonPHIIt()); + // Instead of subtracting one, add the grainsize. + + Value *NextIdx = B2.CreateAdd(InnerIdx, StepSize, + InnerIdx->getName() + ".nadd_stride"); + + //NextIdx->copyIRFlags(PrimaryInc); + // Check if the new index is still within the original trip count. 
+ InnerIdx->addIncoming(NewIdx, NewEntry); + InnerIdx->addIncoming(NextIdx, Latch); + Value *InnerCmp; + if (LatchBR->getSuccessor(0) == Header) + InnerCmp = B2.CreateICmpULT(NextIdx, TripCount, + InnerIdx->getName() + ".ncmp_final"); + + else + InnerCmp = B2.CreateICmpUGE(NextIdx, TripCount, + InnerIdx->getName() + ".ncmp_final"); + + LatchBR->setCondition(InnerCmp); + // In the gpu case, we actually want to replace the induction variable + PrimaryInduction->replaceAllUsesWith(InnerIdx); + } else { + // Add new induction variable for inner loop. + PHINode *InnerIdx = PHINode::Create(PrimaryInduction->getType(), 2, + "inneriter", + Header->getFirstNonPHIIt()); + Value *InnerTestVal = ConstantInt::get(PrimaryInduction->getType(), Count); + B2.SetInsertPoint(LatchBR); + Instruction *InnerSub = cast( + B2.CreateSub(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + InnerIdx->getName() + ".nsub")); + InnerSub->copyIRFlags(PrimaryInc); + // Instruction *InnerAdd = cast( + // B2.CreateAdd(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + // InnerIdx->getName() + ".nadd")); + // InnerAdd->copyIRFlags(PrimaryInc); + Value *InnerCmp; + if (LatchBR->getSuccessor(0) == Header) + InnerCmp = B2.CreateIsNotNull(InnerSub, InnerIdx->getName() + ".ncmp"); + else + InnerCmp = B2.CreateIsNull(InnerSub, InnerIdx->getName() + ".ncmp"); + InnerIdx->addIncoming(InnerTestVal, NewEntry); + InnerIdx->addIncoming(InnerSub, Latch); + // if (LatchBR->getSuccessor(0) == Header) + // InnerCmp = B2.CreateICmpNE(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // else + // InnerCmp = B2.CreateICmpEQ(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // InnerIdx->addIncoming(ConstantInt::get(InnerIdx->getType(), 0), NewEntry); + // InnerIdx->addIncoming(InnerAdd, Latch); + LatchBR->setCondition(InnerCmp); + } // Connect the epilog code to the original loop and update the PHI functions. 
   B2.SetInsertPoint(EpilogPreheader->getTerminator());
-  // Compute the start of the epilog iterations.  We use a divide and multiply
-  // by the power-of-2 count to simplify the SCEV's of the induction variables
-  // for later analysis passes.
-  // Value *EpilStartIter = B2.CreateSub(TripCount, ModVal);
-  Value *EpilStartIter =
-      B2.CreateMul(B2.CreateUDiv(TripCount,
-                                 ConstantInt::get(TripCount->getType(), Count)),
-                   ConstantInt::get(TripCount->getType(), Count));
-  if (Instruction *ESIInst = dyn_cast<Instruction>(EpilStartIter))
-    ESIInst->copyIRFlags(PrimaryInc);
-  connectEpilog(TL, EpilStartIter, ModVal, EpilogPred, LoopReattach, NewExit,
-                LatchExit, Preheader, EpilogPreheader, VMap, DT, LI, SE, DL,
-                PreserveLCSSA);
+  if(!GPU){
+    // Compute the start of the epilog iterations.  We use a divide and multiply
+    // by the power-of-2 count to simplify the SCEV's of the induction variables
+    // for later analysis passes.
+    // Value *EpilStartIter = B2.CreateSub(TripCount, ModVal);
+    Value *EpilStartIter =
+        B2.CreateMul(B2.CreateUDiv(TripCount,
+                                   ConstantInt::get(TripCount->getType(), Count)),
+                     ConstantInt::get(TripCount->getType(), Count));
+    if (Instruction *ESIInst = dyn_cast<Instruction>(EpilStartIter))
+      ESIInst->copyIRFlags(PrimaryInc);
+    connectEpilog(TL, EpilStartIter, ModVal, EpilogPred, LoopReattach, NewExit,
+                  LatchExit, Preheader, EpilogPreheader, VMap, DT, LI, SE, DL,
+                  PreserveLCSSA);
+  }
 
   // If this loop is nested, then the loop stripminer changes the code in the
   // any of its parent loops, so the Scalar Evolution pass needs to be run
@@ -1548,29 +1649,276 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   // }
 
   // Record that the remainder loop was derived from a Tapir loop.
-  (*RemainderLoop)->setDerivedFromTapirLoop();
+  if(!GPU)
+    (*RemainderLoop)->setDerivedFromTapirLoop();
 
   // At this point, the code is well formed.  We now simplify the new loops,
   // doing constant propagation and dead code elimination as we go.
   simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC);
   simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI,
                              AC);
-  simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT,
-                             TTI, AC);
+  if(!GPU)
+    simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT,
+                               TTI, AC);
+
+  // TODO: update all the analyses manually
 #ifndef NDEBUG
-  DT->verify();
-  LI->verify(*DT);
+  //DT->verify();
+  //LI->verify(*DT);
 #endif
 
   // Record that the old loop was derived from a Tapir loop.
   L->setDerivedFromTapirLoop();
 
   // Update TaskInfo manually using the updated DT.
-  if (TI)
+  //if (TI)
     // FIXME: Recalculating TaskInfo for the whole function is wasteful.
     // Optimize this routine in the future.
-    TI->recalculate(*F, *DT);
+    //TI->recalculate(*F, *DT);
+
+  // Reductions take a parallel loop
+  // forall(i=0; i < N){
+  //   for(...) // epilogue logic
+  //     BODY
+  //     sum(&reds[i], a[j]; 0.0)
+  // }
+  // for(i=0; i < nouter; i++) ...  [NOTE(review): original comment text was
+  // lost to angle-bracket stripping in extraction; reconstructed — confirm
+  // against upstream]
+  const std::vector<BasicBlock *> &blocks = L->getBlocks();
+  std::set<std::pair<CallInst *, Type *>> reductions;
+  for (BasicBlock *BB : blocks){
+    for (Instruction &I : *BB) {
+      if(auto ci = dyn_cast<CallInst>(&I)){
+        auto f = ci->getCalledFunction();
+        if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+          LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() <<
+                     "with reduction function: " << f->getName() << "\n");
+          auto ty = ci->getArgOperand(1)->getType();
+          reductions.insert(std::make_pair(ci, ty));
+          //TODO: check the type to confirm valid reduction
+        }
+      }
+    }
+  }
+
+  // To make the stripmining work for multiple backends, we parameterize on the step and the termination condition
+  // Roughly speaking, we want the CPU to look like
+  // n = p*s + k
+  // forall(i = 0; i < p; i++) ...  [NOTE(review): original pseudocode lost to
+  // angle-bracket stripping in extraction; reconstructed — confirm upstream]
+  std::vector<std::tuple<CallInst *, Value *, Value *, Type *, Value *>> redMap;
+  // TODO: Modify the strip mining outer loop to be smaller: currently we are
+  // stack allocating n/2048 reduction values.
+  // TODO: Initialize local reductions with unit values
+  // TODO: move insertion point for reduction allocation
+  // TODO: free reduction allocation
+  Instruction* bloc = nullptr;
+  if(Instruction* I = dyn_cast<Instruction>(TripCount)){
+    bloc = I->getParent()->getTerminator();
+  } else {
+    bloc = F->getEntryBlock().getTerminator();
+  }
+  IRBuilder<> RB(bloc);
+  Value *outerIters;
+  if(!GPU)
+    outerIters = RB.CreateUDiv(TripCount,
+                               ConstantInt::get(TripCount->getType(), Count),
+                               "stripiter");
+  else
+    outerIters = StepSize;
+
+  // Here we iterate over the reductions (calls to reduction functions), and
+  // allocate the local reduction variable array, and build the association
+  // array redMap, and replace references to the original reduction variable
+  // with references to the new local reduction variable in the body of the
+  // inner loop
+  auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1));
+  for(auto &pair : reductions){
+    // TODO: generic allocation/free calls
+    auto ci = pair.first;
+    auto ptr = ci->getArgOperand(0);
+    auto unit = ci->getArgOperand(2);
+    auto ty = pair.second;
+    auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false);
+    auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty)));
+    auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize});
+    //auto al = RB.CreateBitCast(rm, ty);
+    //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction");
+    IRBuilder<> BH(NewLoop->getHeader()->getTerminator());
+    auto lptr = BH.CreateBitCast(
+        BH.CreateGEP(ty, al, NewIdx),
+        ptr->getType());
+    redMap.push_back(std::make_tuple(ci, ptr, al, ty, unit));
+    // Assume there is more than one element, and
+    // use the first element for the first iteration of the loop.
+    // roughly:
+    // red = init;
+    // forall(i = ...){
+    //   red = reduce(red, body(i));
+    // }
+    // red = init;
+    // localred[m+1];
+    //
+    // forall(k ∈ 0..m-1){
+    //   localred[i] = body(j_0);
+    //   for(j ∈ j_k_1..j_k_l-1)
+    //     reduce(localred+i, body(j));
+    // }
+    // for( j ∈ j_k_m .. n )
+    //   reduce(localred+m, body(j));
+    //
+    // for(k ∈ 0..m)
+    //   reduce(&red, localred[k]);
+    //
+    ptr->replaceUsesWithIf(lptr, [L](Use &u){
+      if(auto I = dyn_cast<Instruction>(u.getUser())){
+        return L->contains(I->getParent());
+      } else {
+        return false;
+      };
+    });
+  }
+
+  // Epilog "join" of reduction values stored in local reduction value arrays.
+  // Should be able to use redMap to map original pointer (which is still used
+  // to reduce the remainder of the strimined loop, so you probably want to
+  // start the reduction with that value).
+  LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F);
+
+  if(!reductions.empty()){
+    ValueToValueMapTy VMap;
+    SmallVector<Instruction *, 4> CIS;
+    for(auto &BB : NewLoop->blocks()){
+      // We find the location that we reduce into and create a store of unit
+      // TODO: Get unit value for reduction
+      for(auto &I : *BB){
+        if(auto *CI = dyn_cast<CallInst>(&I)){
+          auto *F = CI->getCalledFunction();
+          if(F->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+            // this must be defined in the outer parallel loop but before the inner loop
+            IRBuilder<> PB(dyn_cast<Instruction>(CI->getArgOperand(0))->getNextNode());
+            PB.CreateStore(CI->getArgOperand(2), CI->getArgOperand(0));
+            CIS.push_back(&I);
+            F->removeFnAttr(Attribute::NoInline);
+            F->removeFnAttr(Attribute::OptimizeNone);
+          }
+        }
+      }
+    }
+
+    // We insert the reduction code at every sync corresponding to the strimined
+    // loop
+    //
+    // Sync
+    // RedEpiHeader
+    // RedEpiBody
+    // RedEpiExit
+
+    Instruction* term = LatchExit->getTerminator();
+    BasicBlock *PostSync = term->getSuccessor(0);
+    BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(),
LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(term, SyncInst::Create(RedEpiHeader, SyncReg)); + BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHIIt()); + IRBuilder<> BH(RedEpiHeader, RedEpiHeader->getFirstNonPHIIt()); + Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit); + Instruction *bodyTerm, *exitTerm; + Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. + for(auto& kv : redMap){ + const auto [ ci, ptr, al, ty, unit ] = kv; + auto lptr = BB.CreateBitCast( + BB.CreateGEP(ty, al, Idx), + ptr->getType()); + auto x = BB.CreateLoad(ty, lptr); + BB.SetCurrentDebugLocation(ci->getDebugLoc()); + BB.CreateCall(ci->getCalledFunction(), { ptr, x , unit}); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + BasicBlock* body = bodyTerm->getParent(); + BasicBlock* loopExit = exitTerm->getParent(); + Idx->addIncoming(IdxAdd, body); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + + // Update Loopinfo with reduction loop + Loop* RL = LI->AllocateLoop(); + if(ParentLoop) ParentLoop->addChildLoop(RL); + else LI->addTopLevelLoop(RL); + if(!ParentLoop){ + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); + } else { + LI->changeLoopFor(RedEpiHeader, RL); + RL->addBlockEntry(RedEpiHeader); + LI->changeLoopFor(body, RL); + RL->addBlockEntry(body); + } + } + + LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); + + // TODO: fix DT updates + //DT->recalculate(*F); + +#ifndef NDEBUG + //DT->verify(); + //LI->verify(*DT); +#endif return NewLoop; } diff --git 
a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp index 9866dcd1fb9ff..56c81b0396bbb 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp @@ -130,8 +130,8 @@ static bool tryToStripMineLoop( "form.\n"); return false; } - bool StripMiningRequested = - (hasLoopStripmineTransformation(L) == TM_ForcedByUser); + //bool StripMiningRequested = + // (hasLoopStripmineTransformation(L) == TM_ForcedByUser); TargetTransformInfo::StripMiningPreferences SMP = gatherStripMiningPreferences(L, SE, TTI, ProvidedCount); @@ -152,12 +152,12 @@ static bool tryToStripMineLoop( // If the loop size is unknown, then we cannot compute a stripmining count for // it. - if (!ExplicitCount && UnknownSize) { - LLVM_DEBUG(dbgs() << " Not stripmining loop with unknown size.\n"); - ORE.emit(createMissedAnalysis("UnknownSize", L) - << "Cannot stripmine loop with unknown size."); - return false; - } + //if (!ExplicitCount && UnknownSize) { + // LLVM_DEBUG(dbgs() << " Not stripmining loop with unknown size.\n"); + // ORE.emit(createMissedAnalysis("UnknownSize", L) + // << "Cannot stripmine loop with unknown size."); + // return false; + // } // If the loop size is enormous, then we might want to use a stripmining count // of 1 for it. @@ -212,12 +212,14 @@ static bool tryToStripMineLoop( // If the loop contains potentially expensive function calls, then we don't // want to stripmine it. + /* if (NumCalls > 0 && !ExplicitCount && !StripMiningRequested) { LLVM_DEBUG(dbgs() << " Skipping loop with expensive function calls.\n"); ORE.emit(createMissedAnalysis("ExpensiveCalls", L) << "Not stripmining loop with potentially expensive calls."); return false; } + */ // Make sure the count is a power of 2. if (!isPowerOf2_32(SMP.Count)) @@ -282,12 +284,30 @@ static bool tryToStripMineLoop( // Save loop properties before it is transformed. 
   MDNode *OrigLoopID = L->getLoopID();
 
+  // TODO: change this to check tapir loop attributes for custom target
+  bool GPU = false;
+
+  TTID target = TGI.hasTTID() ? TGI.getTTID() : TTID::OpenCilk;
+
+  switch(target){
+  // We don't want to stripmine for serial targets
+  case TTID::Serial:
+  case TTID::Cuda:
+  case TTID::Hip:
+    return false;
+  case TTID::GPU:
+    GPU = true;
+    break;
+  default:
+    break;
+  }
+
   // Stripmine the loop
   Loop *RemainderLoop = nullptr;
   Loop *NewLoop = StripMineLoop(L, SMP.Count, SMP.AllowExpensiveTripCount,
                                 SMP.UnrollRemainder, LI, &SE, &DT, TTI, &AC,
                                 TI, &ORE, PreserveLCSSA, ParallelEpilog,
-                                NeedNestedSync, &RemainderLoop);
+                                NeedNestedSync, &RemainderLoop, GPU);
   if (!NewLoop)
     return false;
 
@@ -307,13 +327,10 @@
   return true;
 }
 
-PreservedAnalyses LoopStripMinePass::run(Function &F,
-                                         FunctionAnalysisManager &AM) {
-  Module& M = *F.getParent();
-
-  auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
-  const TapirTargetInfo &TGI = *MAM.getCachedResult<TapirTargetAnalysis>(M);
+static bool loopStripMineImpl(Function &F,
+                              FunctionAnalysisManager &AM,
+                              TapirTargetInfo &TGI) {
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   auto &LI = AM.getResult<LoopAnalysis>(F);
@@ -372,7 +389,7 @@ PreservedAnalyses LoopStripMinePass::run(Function &F,
   // The parent must not be damaged by stripmining!
 #ifndef NDEBUG
   if (LoopChanged && ParentL)
-    ParentL->verifyLoop();
+    //ParentL->verifyLoop();
 #endif
 
   // Clear any cached analysis results for L if we removed it completely.
@@ -380,8 +397,29 @@ PreservedAnalyses LoopStripMinePass::run(Function &F,
       LAM->clear(L, LoopName);
   }
 
+  return Changed;
+}
+
+PreservedAnalyses LoopStripMinePass::run(Module &M,
+                                         ModuleAnalysisManager &AM){
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto &TGI = AM.getResult<TapirTargetAnalysis>(M);
+
+  bool Changed = false;
+
+  for(auto &F: M){
+    if(!F.empty())
+      Changed |= loopStripMineImpl(F, FAM, TGI);
+  }
+
   if (!Changed)
     return PreservedAnalyses::all();
-  return getLoopPassPreservedAnalyses();
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  // If we've changed, assume we've not preserved anything
+  PA.preserve<TapirTargetAnalysis>();
+  return PA;
+  //return PreservedAnalyses::none();
+
 }
+
diff --git a/llvm/lib/Transforms/Tapir/LoweringUtils.cpp b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
index e90970df06f0f..fd17fbce9d882 100644
--- a/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
+++ b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
@@ -25,6 +25,10 @@
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/Tapir/Outline.h"
+#include "llvm/Transforms/Tapir/QthreadsABI.h"
+#include "llvm/Transforms/Tapir/RealmABI.h"
+#include "llvm/Transforms/Tapir/GPUABI.h"
+#include "llvm/Transforms/Tapir/SerialABI.h"
 #include "llvm/Transforms/Tapir/TapirLoopInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
diff --git a/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll b/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll
index 990c577e2e781..a338ee84a92b9 100644
--- a/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll
+++ b/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll
@@ -1,7 +1,7 @@
 ; Check that jump threading does not thread a detach-continue edge if
 ; it does not also thread the corresponding reattach-continue edge.
; -; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars,loop-unroll-full),gvn<>,instcombine,loop-mssa(licm)))),function(loop-stripmine,jump-threading)" -unroll-peel-max-count=0 -require-parallel-epilog -S | FileCheck %s +; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars,loop-unroll-full),gvn<>,instcombine,loop-mssa(licm)))),loop-stripmine,function(jump-threading)" -unroll-peel-max-count=0 -require-parallel-epilog -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll b/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll index 338af8c741b74..b1cd73cfed784 100644 --- a/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll +++ b/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll @@ -1,6 +1,6 @@ ; Check that loop-stripmining generates the correct IR to enable a zero-iteration parallel loop to be optimized away. ; -; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars),sroa))),function(loop-stripmine,early-cse,instcombine),function(simplifycfg)" -S | FileCheck %s +; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars),sroa))),loop-stripmine,function(early-cse,instcombine),function(simplifycfg)" -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll b/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll index fecf057069ec4..b217ffb6059fb 100644 --- a/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll +++ b/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll @@ -1,6 +1,6 @@ ; Check that Tapir loops can be peeled and subsequently stripmined. 
; -; RUN: opt < %s -passes='loop(loop-unroll-full),gvn,loop(tapir-indvars),loop-stripmine' -S | FileCheck %s +; RUN: opt < %s -passes='function(loop(loop-unroll-full),gvn,loop(tapir-indvars)),loop-stripmine' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu"