diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 40e9848e513a8..05cdb8a377345 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4725,6 +4725,14 @@ def ClspvLibclcBuiltin: InheritableAttr {
   let SimpleHandler = 1;
 }
 
+// +===== kitsune-/tapir-centric attributes
+
+def KitsuneReduction : InheritableAttr {
+  let Spellings = [Clang<"kitsune_reduction">];
+  let Subjects = SubjectList<[FunctionLike]>;
+  let Documentation = [StrandMallocDocs];
+}
+
 def TapirTarget : StmtAttr {
   let Spellings = [CXX11<"tapir","target">];
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 283fe88bf2f72..4be6d0b4920ee 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2408,6 +2408,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
       FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly());
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
     }
+    if (TargetDecl->hasAttr<KitsuneReductionAttr>()) {
+      FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction);
+    }
     if (TargetDecl->hasAttr<RestrictAttr>())
       RetAttrs.addAttribute(llvm::Attribute::NoAlias);
     if (TargetDecl->hasAttr<ReturnsNonnullAttr>() &&
diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake
new file mode 100644
index 0000000000000..d8e6cd4bf86ee
--- /dev/null
+++ b/clang/lib/Headers/kitsune.h.cmake
@@ -0,0 +1,145 @@
+
+/*
+ * Copyright (c) 2020 Triad National Security, LLC
+ * All rights reserved.
+ *
+ * This file is part of the kitsune/llvm project. It is released under
+ * the LLVM license.
+ */
+#ifndef __CLANG_KITSUNE_H__
+#define __CLANG_KITSUNE_H__
+
+#include <stdint.h>
+#include <stddef.h>
+
+#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET
+
+#include "kitsune_rt.h"
+
+#if defined(reduction)
+#warning found reduction definition: try putting kitsune lower in include order
+#else
+#define reduction __attribute__((noinline, kitsune_reduction))
+#endif
+
+
+#if defined(KITSUNE_ENABLE_OPENCL_ABI_TARGET)
+#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)a, n)
+#ifdef __cplusplus
+extern "C" {
+#endif
+  void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n);
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#if defined(spawn)
+#warning encountered multiple definitions of spawn!
+#else
+#define spawn _kitsune_spawn
+#endif
+
+#if defined(sync)
+#warning encountered multiple definitions of sync!
+#else
+#define sync _kitsune_sync
+#endif
+
+#if defined(forall)
+#warning encountered multiple definitions of forall!
+#else +#define forall _kitsune_forall +#endif + + +#if defined(_tapir_cuda_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_cuMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_cuMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_cuMemFree(void*); + template + void dealloc(T* array) { + __kitrt_cuMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_cuMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_cuMemAllocManaged(total_bytes); + } + + void __kitrt_cuMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_cuMemFree(array); + } + #endif +#elif defined(_tapir_hip_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_hipMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_hipMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_hipMemFree(void*); + template + void dealloc(T* array) { + __kitrt_hipMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_hipMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_hipMemAllocManaged(total_bytes); + } + + void __kitrt_hipMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_hipMemFree(array); + } + #endif +#else + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_defaultMemAlloc(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_defaultMemAlloc(sizeof(T) * N); + } + + extern "C" void __kitrt_defaultMemFree(void*); + template + void dealloc(T* array) { + __kitrt_defaultMemFree(array); + } + #else + void* __attribute__((malloc)) __kitrt_defaultMemAlloc(size_t); + inline 
__attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_defaultMemAlloc(total_bytes); + } + + void __kitrt_defaultMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void* array) { + __kitrt_defaultMemFree(array); + } + #endif // __cplusplus +#endif // cpu targets + +#endif diff --git a/clang/lib/Headers/magma.h b/clang/lib/Headers/magma.h new file mode 100644 index 0000000000000..43c4fb8f1075f --- /dev/null +++ b/clang/lib/Headers/magma.h @@ -0,0 +1,43 @@ +#include +#include + +template +struct Magma { + virtual a op(a x, a y) = 0; +}; + +template +struct UnitalMagma : public Magma { + virtual a id() = 0; +}; + +// Example unital magmas +template +struct Sum : UnitalMagma{ + a op(a x, a y){ return x + y; } + a id(){ return 0; } // look into this more +}; + +template +struct Product : UnitalMagma { + a op(a x, a y){ return x * y; } + a id(){ return 1; } +}; + +struct StringApp : UnitalMagma { + std::string op(std::string x, std::string y){ return x.append(y); } + std::string id() { return ""; } +}; + +template +struct Max : UnitalMagma { + a op(a x, a y){ return x > y ? x : y; } + a id() { return std::numeric_limits::min(); } +}; + +template +struct Min : UnitalMagma { + a op(a x, a y){ return x < y ? 
x : y; } + a id() { return std::numeric_limits::max(); } +}; + diff --git a/clang/lib/Headers/reductions.h b/clang/lib/Headers/reductions.h new file mode 100644 index 0000000000000..d9abf9cf276b5 --- /dev/null +++ b/clang/lib/Headers/reductions.h @@ -0,0 +1,56 @@ +#include"magma.h" +#include +#include +#include +//#include + +template +a reduce(um m, v& xs){ + auto acc = m.id(); + for(auto x : xs){ + acc = m.op(acc, x); + } + return acc; +} + +template +a parReduce(um m, v& xs, uint64_t nthreads){ + uint64_t linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + assert(linesize % sizeof(a) == 0); + uint64_t linenum = linesize / sizeof(a); + a* accs = new a[nthreads * linenum]; + uint64_t size = xs.end() - xs.begin(); + assert(size % nthreads == 0); + uint64_t grainsize = size / nthreads; + for(uint64_t i=0; i +a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs){ + if(end-start < gs){ + a acc = m.id(); + for(uint64_t i=start; i(S, D, AL); + break; } } diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c new file mode 100644 index 0000000000000..c1918baee371c --- /dev/null +++ b/kitsune-tests/reductions/l2.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(double *a, double b){ + *a += b; +} + +__attribute__((noinline)) +double l2(uint64_t n, double* a){ + double red = 0; + forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include + +double l2(uint64_t n, double* a){ + double red = 0; + #pragma omp parallel for reduction(+:red) + for(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +... 
+  double sum = reduce(Sum<double>(), big);
+```
+
+
+
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 40ae85cf16919..fa5bdd2c5cc4b 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -764,6 +764,7 @@ enum AttributeKindCodes {
   ATTR_KIND_HYBRID_PATCHABLE = 95,
   ATTR_KIND_SANITIZE_CILK = 96,
   ATTR_KIND_STEALABLE = 97,
+  ATTR_KIND_KITSUNE_REDUCTION = 98,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index bea25b9ed4191..63f17c762ca6b 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -239,6 +239,13 @@ def ReadNone : EnumAttr<"readnone", [ParamAttr]>;
 /// Function only reads from memory.
 def ReadOnly : EnumAttr<"readonly", [ParamAttr]>;
 
+/// Tapir reducer-related attributes.
+def HyperView : EnumAttr<"hyper_view", [FnAttr]>;
+def HyperToken : EnumAttr<"hyper_token", [FnAttr]>;
+def ReducerRegister : EnumAttr<"reducer_register", [FnAttr]>;
+def ReducerUnregister : EnumAttr<"reducer_unregister", [FnAttr]>;
+def KitsuneReduction : EnumAttr<"kitsune_reduction", [FnAttr]>;
+
 /// Return value is always equal to this argument.
 def Returned : EnumAttr<"returned", [ParamAttr]>;
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index 59ba8919d8605..d98c0c5f4dfd0 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -571,7 +571,6 @@ class GlobalValue : public Constant {
   // FIXME: need a better solution but also too difficult to re-outline
   // everything w/ tapir for GPU targets (AMD in particular)...
-  //#warning "FIXME: need better solution than mutateType()."
void mutateValueType(Type *Ty) { ValueType = Ty; } diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h index 63bdd19cf6e63..3b45e2cc3754f 100644 --- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h +++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h @@ -60,13 +60,14 @@ class LLVMLoop : public LoopOutlineProcessor { static unsigned NextKernelID; unsigned MyKernelID; Module LLVMM; - TargetMachine *LLVMTargetMachine; GlobalVariable *LLVMGlobal; FunctionCallee GetThreadIdx = nullptr; FunctionCallee GPUInit = nullptr; FunctionCallee GPULaunchKernel = nullptr; FunctionCallee GPUWaitKernel = nullptr; + FunctionCallee GPUManagedMalloc = nullptr; + FunctionCallee GPUGridSize = nullptr; SmallVector OrderedInputs; public: @@ -86,8 +87,9 @@ class LLVMLoop : public LoopOutlineProcessor { ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; + void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo& LI) override final; }; -} +} //namespace llvm #endif /* diff --git a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h index e2e65b4000fed..be0c8a5ed8367 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h +++ b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h @@ -17,6 +17,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Tapir/TapirTargetIDs.h" @@ -449,6 +450,9 @@ class LoopOutlineProcessor { virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { /* no-op */ } + virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo &LI) + { preProcessTapirLoop(TL, VMap); } + /// Processes an outlined Function Helper for a Tapir loop, just after 
the /// function has been outlined. virtual void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 92411913c43ee..09306d88ae805 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2189,6 +2189,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Range; case bitc::ATTR_KIND_INITIALIZES: return Attribute::Initializes; + case bitc::ATTR_KIND_KITSUNE_REDUCTION: + return Attribute::KitsuneReduction; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 85f9d1556c26c..5251549c36f2d 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -889,6 +889,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_RANGE; case Attribute::Initializes: return bitc::ATTR_KIND_INITIALIZES; + case Attribute::KitsuneReduction: + return bitc::ATTR_KIND_KITSUNE_REDUCTION; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index af302214d2545..66ec4c98d9d4e 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -16,13 +16,17 @@ #if 0 #include "llvm/Transforms/Tapir/GPUABI.h" +#include "llvm/Transforms/Tapir/TapirToTarget.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopPeel.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Scalar.h" 
#include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" @@ -37,9 +41,18 @@ using namespace llvm; #define DEBUG_TYPE "gpuabi" -Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { - Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); +// JIT compiler kernel at containing function entry, makes timing easier at the +// cost of less laziness +static cl::opt + JIT("jit-callsite", cl::init(false), cl::NotHidden, + cl::desc("Wait until parallel loop is called to jit kernel. " + "(default=false)")); +Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { + //Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); + Module *M = GrainsizeCall->getModule(); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Value* Grainsize = CallInst::Create(M->getOrInsertFunction("gpuGridSize", Int64Ty), {}, GrainsizeCall); // Replace uses of grainsize intrinsic call with this grainsize value. GrainsizeCall->replaceAllUsesWith(Grainsize); return Grainsize; @@ -96,19 +109,17 @@ LLVMLoop::LLVMLoop(Module &M) // Insert runtime-function declarations in LLVM host modules. 
Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); - Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext()); GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty); - Function* getid = LLVMM.getFunction("gtid"); Type *VoidTy = Type::getVoidTy(M.getContext()); Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - Type *Int8Ty = Type::getInt8Ty(M.getContext()); - Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty); GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy, VoidPtrTy); + GPUManagedMalloc = M.getOrInsertFunction("gpuManagedMalloc", VoidPtrTy, Int64Ty); + GPUGridSize = M.getOrInsertFunction("gpuGridSize", Int64Ty); } void LLVMLoop::setupLoopOutlineArgs( @@ -176,19 +187,25 @@ unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { - LLVMContext &Ctx = M.getContext(); - Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); - //Type *Int64Ty = Type::getInt64Ty(Ctx); - //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); + Function *Helper = Out.Outline; + + DominatorTree DT; + DT.recalculate(*Helper); + LoopInfo LI(DT); + + // The loop we care about is the outermost loop in the BasicBlock *Entry = cast(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast(VMap[L->getHeader()]); BasicBlock *Exit = cast(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); + + Loop *NewLoop = LI.getLoopFor(Header); + + InductionDescriptor ID = TL.getPrimaryInduction().second; Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); Instruction *ClonedSyncReg = 
cast( VMap[T->getDetach()->getSyncRegion()]); @@ -205,8 +222,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); - Function *Helper = Out.Outline; - Helper->setName("kitsune_kernel"); + Helper->setName("kitsune_kernel"); // Fix argument pointer types to global, nocapture // TODO: read/write attributes? LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n"); @@ -231,11 +247,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Grainsize argument is the third LC arg. Grainsize = &*++(++OutlineArgsIter); } - ThreadID = B.CreateMul(ThreadID, Grainsize); - Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); - Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); - Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); - Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); + Value *Cond = B.CreateICmpUGE(ThreadID, End); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); @@ -245,24 +257,73 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Update cloned loop condition to use the thread-end value. 
unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast(VMap[TL.getCondition()]); - if (ClonedCond->getOperand(0) != ThreadEnd) + if (ClonedCond->getOperand(0) != End) ++TripCountIdx; - ClonedCond->setOperand(TripCountIdx, ThreadEnd); - assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && + ClonedCond->setOperand(TripCountIdx, End); + assert(ClonedCond->getOperand(TripCountIdx) == End && "End argument not used in condition"); + // we replace the step + assert(TL.getPrimaryInduction().second.getInductionBinOp()->getOpcode() == Instruction::BinaryOps::Add && + "Only support gpu kernels with addition for induction step"); + ConstantInt *CStep = cast(VMap[ID.getConstIntStepValue()]); + BinaryOperator *BinOp = cast(VMap[ID.getInductionBinOp()]); + assert(CStep && BinOp && "Couldn't infer step or operator from primary induction gpu variable"); + IRBuilder<> IncB(&BinOp->getParent()->front()); + Value* NewStep = IncB.CreateMul(CStep, Grainsize); + BinOp->setOperand(1, NewStep); + ClonedCond->setPredicate(ICmpInst::Predicate::ICMP_UGE); + + // Make each thread reduce into its local memory + for(auto &BB : NewLoop->getBlocks()){ + for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> PB(&I); + auto ptr = CI->getOperand(0); + auto ty = CI->getOperand(1)->getType(); + auto lptr = PB.CreateGEP(ty, ptr, ThreadIdx); + CI->setOperand(0, lptr); + f->removeFnAttr(Attribute::NoInline); + } + } + } + } + TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple())); + TargetLibraryInfo TLI(TLII); + AssumptionCache AC(*Helper); + ScalarEvolution SE(*Helper, TLI, AC, DT, LI); + ValueToValueMapTy peelVMap; + simplifyLoop(NewLoop, &DT, &LI, &SE, &AC, nullptr, false); + peelLoop(NewLoop, 1, &LI, &SE, DT, &AC, false, peelVMap); + SmallVector cis; + for(auto &BB : *Helper){ + if(!NewLoop->contains(&BB)){ // better way? 
+ for(auto &I : BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> pb(&I); + pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); + cis.push_back(&I); + } + } + } + } + } + for(auto &I : cis){ + I->eraseFromParent(); + } } void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); - //Task *T = TL.getTask(); - //Instruction *ReplCall = cast(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; @@ -272,12 +333,7 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, IRBuilder<> B(&NBB->front()); - // Compile the kernel - //LLVMM.getFunctionList().remove(TOI.Outline); - //TOI.Outline->eraseFromParent(); - LLVMContext &LLVMCtx = LLVMM.getContext(); - - ValueToValueMapTy VMap; + ValueToValueMapTy VMap; // We recursively add definitions and declarations to the device module SmallVector todo; todo.push_back(LLVMM.getFunction("kitsune_kernel")); @@ -304,7 +360,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, SmallVector Returns; CloneFunctionInto(deviceF, f, VMap, CloneFunctionChangeType::DifferentModule, Returns); // GPU calls are slow as balls, try to force inlining - deviceF->addFnAttr(Attribute::AlwaysInline); + if(!deviceF->hasFnAttribute(Attribute::NoInline)) + deviceF->addFnAttr(Attribute::AlwaysInline); todo.push_back(deviceF); } } @@ -388,20 +445,10 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - //Value* TripCount = 
isSRetInput(TOI.InputSet[0]) ? TOI.InputSet[1] : TOI.InputSet[0]; - //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, - // TOI.InputSet)); - //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, - // TOI.InputSet)); - - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); - Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); - Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - - Constant *kernelSize = ConstantInt::get(Int64Ty, - LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); - BasicBlock &EBB = Parent->getEntryBlock(); - IRBuilder<> EB(&EBB.front()); + Constant *kernelSize = ConstantInt::get(Int64Ty, + LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); + BasicBlock &EBB = Parent->getEntryBlock(); + IRBuilder<> EB(&EBB.front()); EB.CreateCall(GPUInit, {}); ArrayType* arrayType = ArrayType::get(VoidPtrTy, OrderedInputs.size()); @@ -417,23 +464,103 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, B.CreateStore(VoidVPtr, argPtr); } - Value *Grainsize = TL.getGrainsize() ? 
- ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : - OrderedInputs[2]; - - //Type *Int64Ty = Type::getInt64Ty(LLVMM.getContext()); - Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); - Value *RunRem = B.CreateURem(TripCount, Grainsize); - Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); - Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); - Value *RunSize = B.CreateZExt(B.CreateAdd(RunSizeQ, IsRemAdd), Int64Ty); - - Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); - Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); - Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, RunSize }); + Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); + Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); + Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } -#endif +void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo &LI) { + Loop *L = TL.getLoop(); + BasicBlock *PH = L->getLoopPreheader(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + BranchInst *LatchBR = cast(Latch->getTerminator()); + BasicBlock &Entry = Header->getParent()->getEntryBlock(); + unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 
1 : 0; + BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + DetachInst *DI = cast(Header->getTerminator()); + Value *SyncReg = DI->getSyncRegion(); + Value *GS; + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // accumulate Reductions in main loop + const std::vector& Blocks = L->getBlocks(); + std::set> Reductions; + for (BasicBlock *BB : Blocks){ + for (Instruction &I : *BB) { + if(auto *CI = dyn_cast(&I)){ + auto *F = CI->getCalledFunction(); + if(F->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << CI->getArgOperand(0)->getName() << + "with reduction function: " << F->getName() << "\n"); + auto *Ty = CI->getArgOperand(1)->getType(); + Reductions.insert(std::make_pair(CI, Ty)); + //TODO: check the type to confirm valid reduction + } + } + } + } + + std::vector> RedMap; + + for(auto &Red : Reductions){ + // We assume there's a preheader, and use it to insert allocations for reductions: + // preheader: + // gs = gridSize() + // reds = gpuManagedMalloc(gs) + // br header + IRBuilder<> EB(Entry.getFirstNonPHI()); + CallInst *CI = Red.first; + Value *Ptr = CI->getArgOperand(0); + Type *Ty = Red.second; + GS = EB.CreateCall(GPUGridSize); + Value *NBytes = EB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); + CallInst *Alloc = EB.CreateCall(GPUManagedMalloc, {NBytes}); + // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing + CI->setOperand(0, Alloc); + + RedMap.push_back(std::make_tuple(CI, Ptr, Alloc, Ty)); + } + + if(!Reductions.empty()){ + Instruction* Term = LatchExit->getTerminator(); + BasicBlock *PostSync = Term->getSuccessor(0); + BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(Term, SyncInst::Create(RedEpiHeader, SyncReg)); + 
BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(GS->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(ConstantInt::get(GS->getType(), 0), LatchExit); + Instruction *BodyTerm, *ExitTerm; + Value *Cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, GS); + SplitBlockAndInsertIfThenElse(Cmp, RedEpiHeader->getTerminator(), &BodyTerm, &ExitTerm); + + IRBuilder<> BB(BodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. + for(auto& KV : RedMap){ + const auto [ CI, Ptr, Al, Ty ] = KV; + Value* Lptr = BB.CreateBitCast( + BB.CreateGEP(Ty, Al, Idx), + Ptr->getType()); + Value *LR = BB.CreateLoad(Ty, Lptr); + BB.SetCurrentDebugLocation(CI->getDebugLoc()); + BB.CreateCall(CI->getCalledFunction(), { Ptr, LR }); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + BasicBlock* Body = BodyTerm->getParent(); + Idx->addIncoming(IdxAdd, Body); + ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); + } + + LLVM_DEBUG(dbgs() << "Finished preProcessTapirLoop: " << *PH->getParent()); +} diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp index d98fdad17043a..8d2320688143c 100644 --- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp +++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp @@ -1535,11 +1535,18 @@ Function *LoopSpawningImpl::createHelperForTapirLoop( /// Outline all recorded Tapir loops in the function. TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { + + std::map LoopVMaps; + // Prepare Tapir loops for outlining. for (Task *T : post_order(TI.getRootTask())) { + // Run a pre-processing step before we create the helper function. 
+
     if (TapirLoopInfo *TL = getTapirLoop(T)) {
+
       PredicatedScalarEvolution PSE(SE, *TL->getLoop());
-      bool CanOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME,
+
+      bool CanOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME,
                                                 ORE, TTI);
       if (!CanOutline) {
         const Loop *L = TL->getLoop();
@@ -1552,9 +1559,15 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       // Get an outline processor for each Tapir loop.
       OutlineProcessors[TL] =
           std::unique_ptr<LoopOutlineProcessor>(getOutlineProcessor(TL));
+
+      Loop *L = TL->getLoop();
+      OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L], LI);
     }
   }
 
+  LI.releaseMemory();
+  LI.analyze(DT);
+
   TaskOutlineMapTy TaskToOutline;
   DenseMap<Loop *, ValueSet> LoopInputSets;
   DenseMap<Loop *, SmallVector<Value *, 8>> LoopCtlArgs;
@@ -1589,12 +1602,14 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
     } // end timed region
 
     TapirLoopInfo *TL = getTapirLoop(T);
+    if (!TL) continue;
     Loop *L = TL->getLoop();
 
     LLVM_DEBUG(dbgs() << "Outlining Tapir " << *L << "\n");
 
+
     // Convert the inputs of the Tapir loop to inputs to the helper.
     ValueSet TLInputsFixed;
     ValueToValueMapTy InputMap;
@@ -1611,7 +1626,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
           OutlineProcessors[TL]->getArgStructMode(), InputMap, L);
     } // end timed region
 
-
     ValueSet HelperArgs;
     SmallVector<Value *, 8> HelperInputs;
     {
@@ -1619,6 +1633,7 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
                          "Setup inputs to Tapir-loop helper function",
                          TimerGroupName, TimerGroupDescription,
                          TimePassesIsEnabled);
+
       OutlineProcessors[TL]->setupLoopOutlineArgs(
           F, HelperArgs, HelperInputs, LoopInputSets[L], LoopCtlArgs[L],
           LoopCtlInputs[L], TLInputsFixed);
@@ -1638,11 +1653,8 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       LoopInputs[L].push_back(V);
     LoopArgStarts[L] = ArgStart;
 
-    ValueToValueMapTy VMap;
-
-    // Run a pre-processing step before we create the helper function.
- OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap); - + ValueToValueMapTy &VMap = LoopVMaps[L]; + // Create the helper function. Function *Outline = createHelperForTapirLoop( TL, LoopArgs[L], OutlineProcessors[TL]->getIVArgIndex(F, LoopArgs[L]), diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 99bc2c9a962b7..dfb9cfaeda4f7 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -37,6 +37,7 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; @@ -804,6 +805,10 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + + Function *F = Header->getParent(); + LLVM_DEBUG(dbgs() << "Function before strip mining\n" << *F); + // We will use the increment of the primary induction variable to derive // wrapping flags. Instruction *PrimaryInc = @@ -1528,6 +1533,190 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Record that the remainder loop was derived from a Tapir loop. (*RemainderLoop)->setDerivedFromTapirLoop(); + // Record that the old loop was derived from a Tapir loop. 
+ L->setDerivedFromTapirLoop(); + +#ifndef NDEBUG + DT->verify(); + LI->verify(*DT); +#endif + + // accumulate reductions in main loop + const std::vector& blocks = L->getBlocks(); + std::set> reductions; + for (BasicBlock *BB : blocks){ + for (Instruction &I : *BB) { + if(auto ci = dyn_cast(&I)){ + auto f = ci->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << + "with reduction function: " << f->getName() << "\n"); + auto ty = ci->getArgOperand(1)->getType(); + reductions.insert(std::make_pair(ci, ty)); + //TODO: check the type to confirm valid reduction + } + } + } + } + + // accumulate reductions in epilog loop + LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); + + std::vector> redMap; + // TODO: Modify the strip mining outer loop to be smaller: currently we are + // stack allocating n/2048 reduction values. + // TODO: Initialize local reductions with unit values + Instruction *bloc = nullptr; + if(Instruction* I = dyn_cast(TripCount)){ + bloc = I->getNextNode(); + } else { + bloc = F->getEntryBlock().getTerminator(); + } + IRBuilder<> RB(bloc); + Value *outerIters = RB.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); + auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); + for(auto &pair : reductions){ + // TODO: generic allocation/free calls + auto ci = pair.first; + auto ptr = ci->getArgOperand(0); + auto ty = pair.second; + auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); + auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty))); + auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); + //auto al = RB.CreateBitCast(rm, ty); + //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction"); + IRBuilder<> 
BH(NewLoop->getHeader()->getTerminator()); + auto lptr = BH.CreateBitCast( + BH.CreateGEP(ty, al, NewIdx), + ptr->getType()); + redMap.push_back(std::make_tuple(ci, ptr, al, ty)); + // Assume there is more than one element, and + // use the first element for the first iteration of the loop. + // roughly: + // red = init; + // forall(i = ...){ + // red = reduce(red, body(i)); + // } + // red = init; + // localred[m+1]; + // + // forall(k ∈ 0..m-1){ + // localred[i] = body(j_0); + // for(j ∈ j_k_1..j_k_l-1) + // reduce(localred+i, body(j)); + // } + // for( j ∈ j_k_m .. n ) + // reduce(localred+m, body(j)); + // } + // for(k ∈ 0..m) + // reduce(&red, localred[k]); + // + ptr->replaceUsesWithIf(lptr, [L](Use &u){ + if(auto I = dyn_cast(u.getUser())){ + return L->contains(I->getParent()); + } else { + return false; + }; + }); + } + + // Epilog "join" of reduction values stored in local reduction value arrays. + // Should be able to use redMap to map original pointer (which is still used + // to reduce the remainder of the strimined loop, so you probably want to + // start the reduction with that value). + LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F); + + // We insert the reduction code at every sync corresponding to the strimined + // loop + // + // Sync + // RedEpiHeader + // RedEpiBody + // RedEpiExit + + // Todo: re-order epilogue and reduction epilogue to preserve associativity + if(!reductions.empty()){ + // Peel the first iteration of the loop and replace the reduction calls in + // the peeled code with stores + ValueToValueMapTy VMap; + peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); + SmallVector cis; + for(auto &BB : NewLoop->blocks()){ + if(!L->contains(BB)){ // better way? 
+ for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> pb(&I); + pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); + cis.push_back(&I); + f->removeFnAttr(Attribute::NoInline); + } + } + } + } + } + + for(auto &I : cis){ + I->eraseFromParent(); + } + + Instruction* term = LatchExit->getTerminator(); + BasicBlock *PostSync = term->getSuccessor(0); + BasicBlock *RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT, LI, nullptr, "reductionEpilogue"); + //BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(PostSync->getTerminator(), SyncInst::Create(RedEpiHeader, SyncReg)); + //BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), PostSync); + Instruction *bodyTerm, *exitTerm; + Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. 
+ for(auto& kv : redMap){ + const auto [ ci, ptr, al, ty ] = kv; + auto lptr = BB.CreateBitCast( + BB.CreateGEP(ty, al, Idx), + ptr->getType()); + auto x = BB.CreateLoad(ty, lptr); + BB.SetCurrentDebugLocation(ci->getDebugLoc()); + BB.CreateCall(ci->getCalledFunction(), { ptr, x }); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + BasicBlock* body = bodyTerm->getParent(); + BasicBlock* loopExit = exitTerm->getParent(); + Idx->addIncoming(IdxAdd, body); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + + // Update Loopinfo with reduction loop + Loop* RL = LI->AllocateLoop(); + if(!ParentLoop){ + LI->addTopLevelLoop(RL); + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); + } + else { + ParentLoop->addChildLoop(RL); + LI->changeLoopFor(RedEpiHeader, RL); + RL->addBlockEntry(RedEpiHeader); + LI->changeLoopFor(body, RL); + RL->addBlockEntry(body); + } + simplifyLoop(RL, DT, LI, SE, AC, nullptr, PreserveLCSSA); + } + + LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); // At this point, the code is well formed. We now simplify the new loops, // doing constant propagation and dead code elimination as we go. @@ -1537,13 +1726,11 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC); -#ifndef NDEBUG - DT->verify(); - LI->verify(*DT); -#endif - // Record that the old loop was derived from a Tapir loop. - L->setDerivedFromTapirLoop(); + // TODO: fix DT updates + DT->recalculate(*F); + LI->releaseMemory(); + LI->analyze(*DT); // Update TaskInfo manually using the updated DT. if (TI) @@ -1551,5 +1738,11 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. 
TI->recalculate(*F, *DT); + +#ifndef NDEBUG + DT->verify(); + LI->verify(*DT); +#endif + return NewLoop; } diff --git a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp index e048cdaf57c58..be51e49d0d5d7 100644 --- a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp +++ b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp @@ -151,8 +151,20 @@ void OpenCilkABI::prepareModule() { if ("" != ClOpenCilkRuntimeBCPath) RuntimeBCPath = ClOpenCilkRuntimeBCPath; - if ("" == RuntimeBCPath) - C.emitError("OpenCilkABI: No OpenCilk bitcode ABI file given."); + std::optional path; + if("" == RuntimeBCPath){ + path = sys::Process::FindInEnvPath("LD_LIBRARY_PATH", "libopencilk-abi.bc"); + if (! path) + // TODO: This is an in-tree build solution for now... + #if defined(OPENCILK_BC_PATH) + path = OPENCILK_BC_PATH; + #else + report_fatal_error("Could not find OpenCilk runtime bitcode file " + "(libopencilk-abi.bc) in LD_LIBRARY_PATH."); + #endif + } else { + path = ClOpenCilkRuntimeBCPath.getValue(); + } LLVM_DEBUG(dbgs() << "Using external bitcode file for OpenCilk ABI: " << RuntimeBCPath << "\n");