From 0aa95c2eb0acd5a0971ef5d51384e21e99c14567 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Fri, 10 Dec 2021 09:37:44 -0700
Subject: [PATCH 01/14] Basic loop stripmine-based implicit parallel reduction
 (sum only) codegen

---
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 112 ++++++++++++++++++++
 1 file changed, 112 insertions(+)
diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index 99bc2c9a962b7..3f328d9b4db2a 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -1551,5 +1551,117 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     // Optimize this routine in the future.
     TI->recalculate(*F, *DT);
 
+  // iterate through the stores that should be treated as reductions
+  const std::vector<BasicBlock*>& blocks = L->getBlocks(); 
+  std::set<Value*> reductions;  
+  for (BasicBlock *BB : blocks){
+    for (Instruction &I : *BB) {
+      if(auto si = dyn_cast<StoreInst>(&I)){
+        // TODO: better check if the store should be treated as a
+        // reduction. What we're doing is just checking if it's
+        // storing to a loop invariant pointer
+        Value* ptr = si->getPointerOperand(); 
+        if(L->isLoopInvariant(ptr))
+          reductions.insert(ptr); 
+      }
+    }
+  }
+
+  ValueToValueMap redMap; 
+  // TODO: Modify the strip mining outer loop to be smaller: currently we are
+  // stack allocating n/2048 reduction values.
+  // TODO: Initialize local reductions with unit values
+  for(Value* ptr : reductions){
+    IRBuilder<> B(F->getEntryBlock().getTerminator()); 
+    auto ty = dyn_cast<PointerType>(ptr->getType())->getElementType(); 
+    auto al = B.CreateAlloca(ty, TripCount, ptr->getName() + "_reduction");
+    IRBuilder<> BH(L->getHeader()->getTerminator()); 
+    auto lptr = BH.CreateBitCast(
+      BH.CreateGEP(al, NewIdx), 
+      ptr->getType());                             
+    redMap[ptr] = al; 
+    // TODO: for now, just initializing with the initial sequential
+    // reduction value, which is often unit, but if it isn't this is
+    // wrong.
+    ptr->replaceUsesWithIf(lptr, [L](Use &u){
+      if(auto I = dyn_cast<Instruction>(u.getUser())){
+        I->dump(); 
+        return L->contains(I->getParent()); 
+      } else {
+        return false;
+      }; 
+    });
+    BH.CreateStore(BH.CreateLoad(ptr), lptr);  
+  }
+  
+  // Epilog "join" of reduction values stored in local reduction value arrays.
+  // Should be able to use redMap to map original pointer (which is still used
+  // to reduce the remainder of the strimined loop, so you probably want to
+  // start the reduction with that value).
+  LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F); 
+   
+  // We insert the reduction code at every sync corresponding to the strimined
+  // loop
+  //
+  // Sync 
+  // RedEpiHeader
+  //   RedEpiBody
+  // RedEpiExit
+
+  if(!reductions.empty()){
+    SmallVector<SyncInst*,2> syncs; 
+    for(auto &bb : *F){
+      if(auto *sync = dyn_cast<SyncInst>(bb.getTerminator())){
+        syncs.push_back(sync);
+      }
+    }
+    for(auto *sync : syncs){
+      if(sync->getSyncRegion() == SyncReg){
+        BasicBlock *PostSync = sync->getSuccessor(0);
+        BasicBlock* RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT); 
+        PHINode *Idx = PHINode::Create(TripCount->getType(), 2,
+                                       "reductionepilogueidx",
+                                       RedEpiHeader->getFirstNonPHI());
+        IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); 
+        Idx->addIncoming(TripCount, PostSync);
+        Instruction *bodyTerm, *exitTerm;
+        Value *cmp = BH.CreateIsNotNull(Idx); 
+        SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm);
+
+        IRBuilder<> BB(bodyTerm); 
+        // For each reduction, get the allocated thread local reduced values and
+        // reduce them. For now defaults to sums on primitive types.
+        // TODO: Add custom unital magmas and/or infer unital magma
+        for(auto& kv : redMap){
+          auto al = kv.second;
+          Value* ptr = const_cast<Value*>(kv.first);  
+          auto lptr = BB.CreateBitCast(
+            BB.CreateGEP(al, Idx), 
+            ptr->getType());                             
+          auto acc = BB.CreateLoad(ptr); 
+          auto x = BB.CreateLoad(lptr); 
+          auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x);
+          BB.CreateStore(newacc, ptr); 
+        }
+        Value *IdxSub =
+            BB.CreateSub(Idx, ConstantInt::get(Idx->getType(), 1),
+                              Idx->getName() + ".sub");
+        Idx->addIncoming(IdxSub, bodyTerm->getParent()); 
+        ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); 
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F);  
+
+  // TODO: fix DT updates
+  DT->recalculate(*F); 
+  /*
+#ifndef NDEBUG
+  DT->verify();
+  LI->verify(*DT);
+#endif
+  */
+
   return NewLoop;
 }

From b77e385bd96bd904aee6d82d6407afb03996fb34 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Fri, 10 Dec 2021 09:39:32 -0700
Subject: [PATCH 02/14] Explicit parallel reductions via unital magmas added

---
 clang/lib/Headers/magma.h      | 43 ++++++++++++++++++++++++++
 clang/lib/Headers/reductions.h | 56 ++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 clang/lib/Headers/magma.h
 create mode 100644 clang/lib/Headers/reductions.h

diff --git a/clang/lib/Headers/magma.h b/clang/lib/Headers/magma.h
new file mode 100644
index 0000000000000..43c4fb8f1075f
--- /dev/null
+++ b/clang/lib/Headers/magma.h
@@ -0,0 +1,43 @@
+#include<string>
+#include<limits>
+
+template <typename a> 
+struct Magma {
+  virtual a op(a x, a y) = 0; 
+}; 
+
+template <typename a>
+struct UnitalMagma : public Magma<a> {
+  virtual a id() = 0; 
+}; 
+
+// Example unital magmas
+template <typename a>
+struct Sum : UnitalMagma<a>{
+  a op(a x, a y){ return x + y; }
+  a id(){ return 0; }  // look into this more
+}; 
+
+template <typename a>
+struct Product : UnitalMagma<a> {
+  a op(a x, a y){ return x * y; }
+  a id(){ return 1; }
+}; 
+
+struct StringApp : UnitalMagma<std::string> {
+  std::string op(std::string x, std::string y){ return x.append(y); }
+  std::string id() { return ""; }
+}; 
+
+template <typename a>
+struct Max : UnitalMagma<a> {
+  a op(a x, a y){ return x > y ? x : y; }
+  a id() { return std::numeric_limits<a>::min(); }
+}; 
+
+template <typename a>
+struct Min : UnitalMagma<a> {
+  a op(a x, a y){ return x < y ? x : y; }
+  a id() { return std::numeric_limits<a>::max(); }
+}; 
+
diff --git a/clang/lib/Headers/reductions.h b/clang/lib/Headers/reductions.h
new file mode 100644
index 0000000000000..d9abf9cf276b5
--- /dev/null
+++ b/clang/lib/Headers/reductions.h
@@ -0,0 +1,56 @@
+#include"magma.h"
+#include<assert.h>
+#include<unistd.h>
+#include<stdio.h>
+//#include<kitsune.h>
+
+template <typename a, typename um, typename v> 
+a reduce(um m, v& xs){
+  auto acc = m.id(); 
+  for(auto x : xs){
+    acc = m.op(acc, x); 
+  }
+  return acc; 
+}
+
+template <typename a, typename um, typename v>
+a parReduce(um m, v& xs, uint64_t nthreads){
+  uint64_t linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); 
+  assert(linesize % sizeof(a) == 0); 
+  uint64_t linenum = linesize / sizeof(a); 
+  a* accs = new a[nthreads * linenum]; 
+  uint64_t size = xs.end() - xs.begin(); 
+  assert(size % nthreads == 0); 
+  uint64_t grainsize = size / nthreads; 
+  for(uint64_t i=0; i<nthreads; i++){
+    accs[i*linenum] = m.id();
+    for(uint64_t j = i*grainsize; j<(i+1)*grainsize; j++){
+      accs[i*linenum] = m.op(accs[i*linenum], xs[j]);
+    }
+  }
+  a acc = m.id(); 
+  for(uint64_t i=0; i<nthreads; i++){
+    acc = m.op(acc, accs[i*linenum]); 
+  }
+  delete[] accs; 
+  return acc; 
+}
+
+template<typename a, typename um, typename v> 
+a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs){
+  if(end-start < gs){
+    a acc = m.id();
+    for(uint64_t i=start; i<end; i++){
+      acc+=xs[i];
+    }
+    return acc; 
+  }
+  else{
+    uint64_t mid = (start + end) / 2; 
+    return treeReduce(m, xs, start, mid, gs) 
+         + treeReduce(m, xs, mid, end, gs); 
+  }
+}
+
+  
+

From 213a51f67199ee4be2029c103935ff8fd798ace9 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Fri, 10 Dec 2021 09:45:22 -0700
Subject: [PATCH 03/14] Added reductions documentation

---
 kitsune/docs/using.md | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/kitsune/docs/using.md b/kitsune/docs/using.md
index b07a2fa4ea435..e843f6f2f1e76 100644
--- a/kitsune/docs/using.md
+++ b/kitsune/docs/using.md
@@ -99,3 +99,32 @@ Each special mode and runtime transformation/ABI target has its own named config
   * ``opencl.cfg``: OpenCL runtime ABI target specific flags. 
 
 These files can reduce complexity for end users by providing configuration- and build-specific flags.  This can be important when version-specific bitcode files and other details are used.  In addition, these files can provide developers additional flexibility for debugging, testing, and experimenting.  Obviously, all these features can also be hardcoded onto the command line for a more traditional use case.  In addition, to override any of the Kitsune or system configuration files you can place an empty config file within the user directory (no kitsune or system configuration files will be read in this case). 
+
+## Reductions 
+We provide two approaches to reductions. The first (still very much a work in
+progress and likely to break) is implicit reductions. This allows you to write
+basic reductions in the way you would for sequential code, and have them be
+optimized for parallelism, e.g. 
+
+```
+forall(auto x : xs) {
+  acc += x;
+}
+```
+
+should generate efficient parallel reduction code.
+
+Second, we provide a c++ interface for parallel reduction via user-defined
+reduction operators. Formally, we require a unital magma, which is just a
+reduction operator and a unit value, e.g. 0 for sums and 1 for products.
+
+This allows for the following style of reductions: 
+
+```
+#include<reductions.h>
+...
+  double sum = reduce<double>(Sum<double>(), big);
+```
+
+
+

From 46a062919d7fba678ccbcc4e172cd5bea79e64bf Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Tue, 28 Jun 2022 10:34:21 -0600
Subject: [PATCH 04/14] Implicit sum reductions functional

---
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 28 ++++++++++++++++-----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index 3f328d9b4db2a..9f49e6cec8d6d 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -1561,20 +1561,32 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
         // reduction. What we're doing is just checking if it's
         // storing to a loop invariant pointer
         Value* ptr = si->getPointerOperand(); 
-        if(L->isLoopInvariant(ptr))
+        if(L->isLoopInvariant(ptr)){
+          LLVM_DEBUG(dbgs() << "Found reduction var: " << ptr->getName() << "\n"); 
           reductions.insert(ptr); 
+        }
       }
     }
   }
+  LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); 
 
   ValueToValueMap redMap; 
   // TODO: Modify the strip mining outer loop to be smaller: currently we are
   // stack allocating n/2048 reduction values.
   // TODO: Initialize local reductions with unit values
+  Instruction *bloc = nullptr;
+  if(Instruction* I = dyn_cast<Instruction>(TripCount)){
+    bloc = I->getNextNode();
+  } else {
+    bloc = F->getEntryBlock().getTerminator();
+  }
+  IRBuilder<> RB(bloc); 
+  Value *outerIters = RB.CreateUDiv(TripCount,
+                               ConstantInt::get(TripCount->getType(), Count),
+                               "stripiter");
   for(Value* ptr : reductions){
-    IRBuilder<> B(F->getEntryBlock().getTerminator()); 
     auto ty = dyn_cast<PointerType>(ptr->getType())->getElementType(); 
-    auto al = B.CreateAlloca(ty, TripCount, ptr->getName() + "_reduction");
+    auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction");
     IRBuilder<> BH(L->getHeader()->getTerminator()); 
     auto lptr = BH.CreateBitCast(
       BH.CreateGEP(al, NewIdx), 
@@ -1617,13 +1629,17 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     }
     for(auto *sync : syncs){
       if(sync->getSyncRegion() == SyncReg){
+        BasicBlock *Sync = sync->getParent(); 
         BasicBlock *PostSync = sync->getSuccessor(0);
-        BasicBlock* RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT); 
-        PHINode *Idx = PHINode::Create(TripCount->getType(), 2,
+        BasicBlock* RedEpiHeader = BasicBlock::Create(Sync->getContext(), "reductionEpilogue", Sync->getParent(), Sync); 
+        RedEpiHeader->moveAfter(Sync); 
+        sync->setSuccessor(0, RedEpiHeader); 
+        BranchInst::Create(PostSync, RedEpiHeader); 
+        PHINode *Idx = PHINode::Create(outerIters->getType(), 2,
                                        "reductionepilogueidx",
                                        RedEpiHeader->getFirstNonPHI());
         IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); 
-        Idx->addIncoming(TripCount, PostSync);
+        Idx->addIncoming(outerIters, sync->getParent());
         Instruction *bodyTerm, *exitTerm;
         Value *cmp = BH.CreateIsNotNull(Idx); 
         SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm);

From ce98bc63b260c80ce5039a2bd43298804e5ee1b5 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Wed, 1 Feb 2023 08:48:02 -0700
Subject: [PATCH 05/14] Outlined implementation approach for updated reduction
 implementation

---
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 31 ++++++++++++++++-----
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index 9f49e6cec8d6d..b83bcb7da6a49 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -1589,12 +1589,28 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction");
     IRBuilder<> BH(L->getHeader()->getTerminator()); 
     auto lptr = BH.CreateBitCast(
-      BH.CreateGEP(al, NewIdx), 
+      BH.CreateGEP(ty, al, NewIdx), 
       ptr->getType());                             
     redMap[ptr] = al; 
     // TODO: for now, just initializing with the initial sequential
     // reduction value, which is often unit, but if it isn't this is
-    // wrong.
+    // wrong. What we need to do is assume there is more than one element, and
+    // use the first element for the first iteration of the loop.
+    // roughly: 
+    //   red = init; 
+    //   forall(i = ...){
+    //     red = reduce(red, body(i)); 
+    //   }
+    //   red = init; 
+    //   localred[m]; 
+    //   forall(k = ...){
+    //     localred[i] = body(j_0); 
+    //     for(j = j_1 ...)
+    //       localred = reduce(localred, body(j));
+    //   }
+    //   for(k = ...)
+    //     red = reduce(red, localred[k]); 
+    //
     ptr->replaceUsesWithIf(lptr, [L](Use &u){
       if(auto I = dyn_cast<Instruction>(u.getUser())){
         I->dump(); 
@@ -1603,7 +1619,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
         return false;
       }; 
     });
-    BH.CreateStore(BH.CreateLoad(ptr), lptr);  
+    BH.CreateStore(BH.CreateLoad(ty, ptr), lptr);  
   }
   
   // Epilog "join" of reduction values stored in local reduction value arrays.
@@ -1651,11 +1667,12 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
         for(auto& kv : redMap){
           auto al = kv.second;
           Value* ptr = const_cast<Value*>(kv.first);  
+          auto ty = ptr->getType(); 
           auto lptr = BB.CreateBitCast(
-            BB.CreateGEP(al, Idx), 
-            ptr->getType());                             
-          auto acc = BB.CreateLoad(ptr); 
-          auto x = BB.CreateLoad(lptr); 
+            BB.CreateGEP(ty, al, Idx), 
+            ty);                             
+          auto acc = BB.CreateLoad(ty, ptr); 
+          auto x = BB.CreateLoad(ty, lptr); 
           auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x);
           BB.CreateStore(newacc, ptr); 
         }

From 0f86c20e99185ce469aad5decdf5ab8a26fdb0d5 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Tue, 21 Feb 2023 16:17:19 -0700
Subject: [PATCH 06/14] Working reductions, sequential semantics preserved for
 commutative associative magmas

---
 clang/include/clang/Basic/Attr.td           |   8 ++
 clang/lib/CodeGen/CGCall.cpp                |   3 +
 clang/lib/Headers/kitsune.h.cmake           | 145 ++++++++++++++++++++
 clang/lib/Sema/SemaDeclAttr.cpp             |   4 +-
 kitsune-tests/reductions/l2.c               |  36 +++++
 llvm/include/llvm/IR/Attributes.td          |  10 ++
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 140 ++++++++++---------
 llvm/lib/Transforms/Tapir/OpenCilkABI.cpp   |  18 ++-
 8 files changed, 296 insertions(+), 68 deletions(-)
 create mode 100644 clang/lib/Headers/kitsune.h.cmake
 create mode 100644 kitsune-tests/reductions/l2.c

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 40e9848e513a8..05cdb8a377345 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4725,6 +4725,14 @@ def ClspvLibclcBuiltin: InheritableAttr {
   let SimpleHandler = 1;
 }
 
+// +===== kitsune-/tapir-centric attributes
+
+def KitsuneReduction : InheritableAttr {
+  let Spellings = [Clang<"kitsune_reduction">];
+  let Subjects = SubjectList<[FunctionLike]>;
+  let Documentation = [StrandMallocDocs];
+}
+
 def TapirTarget : StmtAttr {
   let Spellings = [CXX11<"tapir","target">];
 
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 283fe88bf2f72..4be6d0b4920ee 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2408,6 +2408,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
       FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly());
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
     }
+    if (TargetDecl->hasAttr<KitsuneReductionAttr>()) {
+      FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction);
+    }
     if (TargetDecl->hasAttr<RestrictAttr>())
       RetAttrs.addAttribute(llvm::Attribute::NoAlias);
     if (TargetDecl->hasAttr<ReturnsNonNullAttr>() &&
diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake
new file mode 100644
index 0000000000000..d8e6cd4bf86ee
--- /dev/null
+++ b/clang/lib/Headers/kitsune.h.cmake
@@ -0,0 +1,145 @@
+
+/*
+ * Copyright (c) 2020 Triad National Security, LLC
+ *                         All rights reserved.
+ *
+ * This file is part of the kitsune/llvm project.  It is released under
+ * the LLVM license.
+ */
+#ifndef __CLANG_KITSUNE_H__
+#define __CLANG_KITSUNE_H__
+
+#include <stdint.h>
+#include <stddef.h>
+
+#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET
+
+#include "kitsune_rt.h"
+
+#if defined(reduction)
+#warning found reduction definition: try puttin kitsune lower in include order
+#else
+#define reduction __attribute__((noinline, kitsune_reduction))
+#endif
+
+
+#if defined(KITSUNE_ENABLE_OPENCL_ABI_TARGET)
+#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)a, n)
+#ifdef __cplusplus
+extern "C" {
+#endif
+  void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n);
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#if defined(spawn)
+#warning encountered multiple definitions of spawn!
+#else
+#define spawn _kitsune_spawn
+#endif
+
+#if defined(sync)
+#warning encountered multiple definitions of sync!
+#else
+#define sync _kitsune_sync
+#endif
+
+#if defined(forall)
+#warning encountered multiple definitions of forall!
+#else
+#define forall _kitsune_forall
+#endif
+
+
+#if defined(_tapir_cuda_target)
+  #ifdef __cplusplus
+    extern "C" __attribute__((malloc)) void* __kitrt_cuMemAllocManaged(size_t);
+    template <typename T>
+    inline __attribute__((always_inline))
+      T* alloc(size_t N) {
+      return (T*)__kitrt_cuMemAllocManaged(sizeof(T) * N);
+    }
+
+    extern "C" void __kitrt_cuMemFree(void*);
+    template <typename T>
+    void dealloc(T* array) {
+      __kitrt_cuMemFree((void*)array);
+    }
+  #else
+    void* __attribute__((malloc)) __kitrt_cuMemAllocManaged(size_t);
+    inline __attribute__((always_inline))
+    void *alloc(size_t total_bytes) {
+      return __kitrt_cuMemAllocManaged(total_bytes);
+    }
+
+    void __kitrt_cuMemFree(void*);
+    inline __attribute__((always_inline))
+    void dealloc(void *array) {
+      __kitrt_cuMemFree(array);
+    }
+  #endif
+#elif defined(_tapir_hip_target)
+  #ifdef __cplusplus
+    extern "C" __attribute__((malloc)) void* __kitrt_hipMemAllocManaged(size_t);
+    template <typename T>
+    inline __attribute__((always_inline))
+      T* alloc(size_t N) {
+      return (T*)__kitrt_hipMemAllocManaged(sizeof(T) * N);
+    }
+
+    extern "C" void __kitrt_hipMemFree(void*);
+    template <typename T>
+    void dealloc(T* array) {
+      __kitrt_hipMemFree((void*)array);
+    }
+  #else
+    void* __attribute__((malloc)) __kitrt_hipMemAllocManaged(size_t);
+    inline __attribute__((always_inline))
+    void *alloc(size_t total_bytes) {
+      return __kitrt_hipMemAllocManaged(total_bytes);
+    }
+
+    void __kitrt_hipMemFree(void*);
+    inline __attribute__((always_inline))
+    void dealloc(void *array) {
+       __kitrt_hipMemFree(array);
+    }
+  #endif
+#else
+  #ifdef __cplusplus
+    extern "C" __attribute__((malloc)) void* __kitrt_defaultMemAlloc(size_t);
+    template <typename T>
+    inline __attribute__((always_inline))
+    T* alloc(size_t N) {
+      return (T*)__kitrt_defaultMemAlloc(sizeof(T) * N);
+    }
+
+    extern "C" void __kitrt_defaultMemFree(void*);
+    template <typename T>
+    void dealloc(T* array) {
+      __kitrt_defaultMemFree(array);
+    }
+  #else
+    void* __attribute__((malloc)) __kitrt_defaultMemAlloc(size_t);
+    inline __attribute__((always_inline))
+    void *alloc(size_t total_bytes) {
+      return __kitrt_defaultMemAlloc(total_bytes);
+    }
+
+    void __kitrt_defaultMemFree(void*);
+    inline __attribute__((always_inline))
+    void dealloc(void* array) {
+       __kitrt_defaultMemFree(array);
+    }
+  #endif // __cplusplus
+#endif // cpu targets
+
+#endif
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2f426bba3c74b..a159eb8e55287 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7120,10 +7120,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_TypeNullable:
     handleNullableTypeAttr(S, D, AL);
     break;
-
   case ParsedAttr::AT_VTablePointerAuthentication:
     handleVTablePointerAuthentication(S, D, AL);
     break;
+  case ParsedAttr::AT_KitsuneReduction:
+    handleSimpleAttribute<KitsuneReductionAttr>(S, D, AL);
+    break;
   }
 }
 
diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c
new file mode 100644
index 0000000000000..fc81e0057e8a5
--- /dev/null
+++ b/kitsune-tests/reductions/l2.c
@@ -0,0 +1,36 @@
+#include<math.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<kitsune.h>
+
+reduction
+void sum(float *a, float b){
+  *a += b;
+}
+
+float l2(int n, float* a){
+  float red = 3.14159; 
+  forall(int i=0; i<n; i++){
+    sum(&red, a[i]*a[i]); 
+  }
+
+  return sqrt(red);
+}
+
+float l2_seq(int n, float* a){
+  float red = 3.14159; 
+  for(int i=0; i < n; i++){
+    sum(&red, a[i]*a[i]); 
+  }
+  return sqrt(red);
+}
+
+int main(int argc, char** argv){
+  int n = argc > 1 ? atoi(argv[1]) : 4096 ; 
+  float* arr = (float*)malloc(sizeof(float) * n); 
+  for(int i=0 ; i<n; i++){
+    arr[i] = i; 
+  }
+  printf("par: %f , seq: %f\n" , l2(n, arr), l2_seq(n, arr));
+}
+
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index bea25b9ed4191..63f17c762ca6b 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -239,6 +239,16 @@ def ReadNone : EnumAttr<"readnone", [ParamAttr]>;
 /// Function only reads from memory.
 def ReadOnly : EnumAttr<"readonly", [ParamAttr]>;
 
+<<<<<<< HEAD
+=======
+/// Tapir reducer-related attributes.
+def HyperView : EnumAttr<"hyper_view", [FnAttr]>;
+def HyperToken : EnumAttr<"hyper_token", [FnAttr]>;
+def ReducerRegister : EnumAttr<"reducer_register", [FnAttr]>;
+def ReducerUnregister : EnumAttr<"reducer_unregister", [FnAttr]>;
+def KitsuneReduction : EnumAttr<"kitsune_reduction", [FnAttr]>;
+
+>>>>>>> e2078e0739d8 (Working reductions, sequential semantics preserved for commutative associative magmas)
 /// Return value is always equal to this argument.
 def Returned : EnumAttr<"returned", [ParamAttr]>;
 
diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index b83bcb7da6a49..fa1af8e829a31 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
 #include "llvm/Transforms/Utils/TapirUtils.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
 
 using namespace llvm;
 
@@ -804,6 +805,10 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
   BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
 
+  
+  Function *F = Header->getParent();
+  LLVM_DEBUG(dbgs() << "Function before strip mining\n" << *F); 
+
   // We will use the increment of the primary induction variable to derive
   // wrapping flags.
   Instruction *PrimaryInc =
@@ -1551,26 +1556,25 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     // Optimize this routine in the future.
     TI->recalculate(*F, *DT);
 
-  // iterate through the stores that should be treated as reductions
+  // accumulate reductions
   const std::vector<BasicBlock*>& blocks = L->getBlocks(); 
-  std::set<Value*> reductions;  
+  std::set<CallInst*> reductions;
   for (BasicBlock *BB : blocks){
     for (Instruction &I : *BB) {
-      if(auto si = dyn_cast<StoreInst>(&I)){
-        // TODO: better check if the store should be treated as a
-        // reduction. What we're doing is just checking if it's
-        // storing to a loop invariant pointer
-        Value* ptr = si->getPointerOperand(); 
-        if(L->isLoopInvariant(ptr)){
-          LLVM_DEBUG(dbgs() << "Found reduction var: " << ptr->getName() << "\n"); 
-          reductions.insert(ptr); 
+      if(auto ci = dyn_cast<CallInst>(&I)){
+        auto f = ci->getCalledFunction(); 
+        if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+          LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << 
+                               "with reduction function: " << f->getName() << "\n"); 
+          reductions.insert(ci); 
+          //TODO: check the type to confirm valid reduction
         }
       }
     }
   }
   LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); 
 
-  ValueToValueMap redMap; 
+  std::vector<std::tuple<CallInst*, Value* , Value*>> redMap; 
   // TODO: Modify the strip mining outer loop to be smaller: currently we are
   // stack allocating n/2048 reduction values.
   // TODO: Initialize local reductions with unit values
@@ -1584,17 +1588,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   Value *outerIters = RB.CreateUDiv(TripCount,
                                ConstantInt::get(TripCount->getType(), Count),
                                "stripiter");
-  for(Value* ptr : reductions){
+  for(CallInst* ci : reductions){
+    auto ptr = ci->getArgOperand(0); 
     auto ty = dyn_cast<PointerType>(ptr->getType())->getElementType(); 
     auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction");
-    IRBuilder<> BH(L->getHeader()->getTerminator()); 
+    IRBuilder<> BH(NewLoop->getHeader()->getTerminator()); 
     auto lptr = BH.CreateBitCast(
       BH.CreateGEP(ty, al, NewIdx), 
       ptr->getType());                             
-    redMap[ptr] = al; 
-    // TODO: for now, just initializing with the initial sequential
-    // reduction value, which is often unit, but if it isn't this is
-    // wrong. What we need to do is assume there is more than one element, and
+    redMap.push_back(std::make_tuple(ci, ptr, al)); 
+    // Assume there is more than one element, and
     // use the first element for the first iteration of the loop.
     // roughly: 
     //   red = init; 
@@ -1606,20 +1609,18 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     //   forall(k = ...){
     //     localred[i] = body(j_0); 
     //     for(j = j_1 ...)
-    //       localred = reduce(localred, body(j));
+    //       reduce(localred+i, body(j));
     //   }
     //   for(k = ...)
-    //     red = reduce(red, localred[k]); 
+    //     reduce(&red, localred[k]); 
     //
     ptr->replaceUsesWithIf(lptr, [L](Use &u){
       if(auto I = dyn_cast<Instruction>(u.getUser())){
-        I->dump(); 
         return L->contains(I->getParent()); 
       } else {
         return false;
       }; 
     });
-    BH.CreateStore(BH.CreateLoad(ty, ptr), lptr);  
   }
   
   // Epilog "join" of reduction values stored in local reduction value arrays.
@@ -1636,53 +1637,64 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   //   RedEpiBody
   // RedEpiExit
 
+  // Todo: re-order epilogue and reduction epilogue to preserve associativity
   if(!reductions.empty()){
-    SmallVector<SyncInst*,2> syncs; 
-    for(auto &bb : *F){
-      if(auto *sync = dyn_cast<SyncInst>(bb.getTerminator())){
-        syncs.push_back(sync);
-      }
-    }
-    for(auto *sync : syncs){
-      if(sync->getSyncRegion() == SyncReg){
-        BasicBlock *Sync = sync->getParent(); 
-        BasicBlock *PostSync = sync->getSuccessor(0);
-        BasicBlock* RedEpiHeader = BasicBlock::Create(Sync->getContext(), "reductionEpilogue", Sync->getParent(), Sync); 
-        RedEpiHeader->moveAfter(Sync); 
-        sync->setSuccessor(0, RedEpiHeader); 
-        BranchInst::Create(PostSync, RedEpiHeader); 
-        PHINode *Idx = PHINode::Create(outerIters->getType(), 2,
-                                       "reductionepilogueidx",
-                                       RedEpiHeader->getFirstNonPHI());
-        IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); 
-        Idx->addIncoming(outerIters, sync->getParent());
-        Instruction *bodyTerm, *exitTerm;
-        Value *cmp = BH.CreateIsNotNull(Idx); 
-        SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm);
-
-        IRBuilder<> BB(bodyTerm); 
-        // For each reduction, get the allocated thread local reduced values and
-        // reduce them. For now defaults to sums on primitive types.
-        // TODO: Add custom unital magmas and/or infer unital magma
-        for(auto& kv : redMap){
-          auto al = kv.second;
-          Value* ptr = const_cast<Value*>(kv.first);  
-          auto ty = ptr->getType(); 
-          auto lptr = BB.CreateBitCast(
-            BB.CreateGEP(ty, al, Idx), 
-            ty);                             
-          auto acc = BB.CreateLoad(ty, ptr); 
-          auto x = BB.CreateLoad(ty, lptr); 
-          auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x);
-          BB.CreateStore(newacc, ptr); 
+    // Peel the first iteration of the loop and replace the reduction calls in
+    // the peeled code with stores
+    peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA); 
+    SmallVector<Instruction*, 4> cis;
+    for(auto &BB : NewLoop->blocks()){
+      if(!L->contains(BB)){ // better way?
+        for(auto &I : *BB){
+          if(auto *CI = dyn_cast<CallInst>(&I)){
+            auto *f = CI->getCalledFunction(); 
+            if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+              IRBuilder<> pb(&I); 
+              pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); 
+              cis.push_back(&I); 
+              f->removeFnAttr(Attribute::NoInline); 
+            }
+          }
         }
-        Value *IdxSub =
-            BB.CreateSub(Idx, ConstantInt::get(Idx->getType(), 1),
-                              Idx->getName() + ".sub");
-        Idx->addIncoming(IdxSub, bodyTerm->getParent()); 
-        ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); 
       }
     }
+    
+    for(auto &I : cis){
+      I->eraseFromParent();  
+    }
+
+    Instruction* term = LatchExit->getTerminator(); 
+    BasicBlock *PostSync = term->getSuccessor(0);
+    BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); 
+    RedEpiHeader->moveAfter(LatchExit); 
+    ReplaceInstWithInst(term, SyncInst::Create(RedEpiHeader, SyncReg)); 
+    BranchInst::Create(PostSync, RedEpiHeader);
+    PHINode *Idx = PHINode::Create(outerIters->getType(), 2,
+                                   "reductionepilogueidx",
+                                   RedEpiHeader->getFirstNonPHI());
+    IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); 
+    Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit);
+    Instruction *bodyTerm, *exitTerm;
+    Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); 
+    SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm);
+
+    IRBuilder<> BB(bodyTerm); 
+    // For each reduction, get the allocated thread local reduced values and
+    // reduce them. 
+    for(auto& kv : redMap){
+      const auto [ ci, ptr, al ] = kv; 
+      auto ty = dyn_cast<PointerType>(ptr->getType())->getElementType(); 
+      auto lptr = BB.CreateBitCast(
+        BB.CreateGEP(ty, al, Idx), 
+        ptr->getType());                             
+      auto x = BB.CreateLoad(ty, lptr); 
+      BB.CreateCall(ci->getCalledFunction(), { ptr, x }); 
+    }
+    Value *IdxAdd =
+        BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1),
+                          Idx->getName() + ".add");
+    Idx->addIncoming(IdxAdd, bodyTerm->getParent()); 
+    ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); 
   }
 
   LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F);  
diff --git a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
index e048cdaf57c58..98339299f009d 100644
--- a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
+++ b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
@@ -151,8 +151,20 @@ void OpenCilkABI::prepareModule() {
     if ("" != ClOpenCilkRuntimeBCPath)
       RuntimeBCPath = ClOpenCilkRuntimeBCPath;
 
-    if ("" == RuntimeBCPath)
-      C.emitError("OpenCilkABI: No OpenCilk bitcode ABI file given.");
+    std::optional<std::string> path;
+    if("" == RuntimeBCPath){
+      path = sys::Process::FindInEnvPath("LD_LIBRARY_PATH", "libopencilk-abi.bc");
+      if (! path)
+        // TODO: This is an in-tree build solution for now...
+        #if defined(OPENCILK_BC_PATH)
+        path = OPENCILK_BC_PATH;
+        #else
+        report_fatal_error("Could not find OpenCilk runtime bitcode file "
+                           "(libopencilk-abi.bc) in LD_LIBRARY_PATH.");
+        #endif
+    } else {
+      path = ClOpenCilkRuntimeBCPath.getValue();
+    }
 
     LLVM_DEBUG(dbgs() << "Using external bitcode file for OpenCilk ABI: "
                       << RuntimeBCPath << "\n");
@@ -161,7 +173,7 @@ void OpenCilkABI::prepareModule() {
     // Parse the bitcode file.  This call imports structure definitions, but not
     // function definitions.
     if (std::unique_ptr<Module> ExternalModule =
-        parseIRFile(RuntimeBCPath, SMD, C)) {
+        parseIRFile(path.getValue(), SMD, C)) {
       // Get the original DiagnosticHandler for this context.
       std::unique_ptr<DiagnosticHandler> OrigDiagHandler =
           C.getDiagnosticHandler();

From 983ddbac55f6cb749bec793f36d792224c6968dc Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Tue, 13 Jun 2023 11:32:26 -0600
Subject: [PATCH 07/14] updated reduction example with timing

---
 kitsune-tests/reductions/l2.c               | 40 +++++++++++++------
 llvm/lib/Transforms/Tapir/GPUABI.cpp        | 44 +++++++--------------
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 39 +++++++++++++-----
 3 files changed, 73 insertions(+), 50 deletions(-)

diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c
index fc81e0057e8a5..e39b8765da54d 100644
--- a/kitsune-tests/reductions/l2.c
+++ b/kitsune-tests/reductions/l2.c
@@ -1,36 +1,54 @@
+#include<time.h>
 #include<math.h>
 #include<stdio.h>
 #include<stdlib.h>
 #include<kitsune.h>
+#include<gpu.h>
 
 reduction
-void sum(float *a, float b){
+void sum(double *a, double b){
   *a += b;
 }
 
-float l2(int n, float* a){
-  float red = 3.14159; 
-  forall(int i=0; i<n; i++){
+double l2(uint64_t n, double* a){
+  double red = 0; 
+  forall(uint64_t i=0; i<n; i++){
     sum(&red, a[i]*a[i]); 
   }
 
   return sqrt(red);
 }
 
-float l2_seq(int n, float* a){
-  float red = 3.14159; 
-  for(int i=0; i < n; i++){
+double l2_seq(uint64_t n, double* a){
+  double red = 0; 
+  for(uint64_t i=0; i < n; i++){
     sum(&red, a[i]*a[i]); 
   }
   return sqrt(red);
 }
 
 int main(int argc, char** argv){
-  int n = argc > 1 ? atoi(argv[1]) : 4096 ; 
-  float* arr = (float*)malloc(sizeof(float) * n); 
-  for(int i=0 ; i<n; i++){
+  uint64_t n = argc > 1 ? atoi(argv[1]) : 2ULL<<28 ; 
+  double* arr = (double*)gpuManagedMalloc(sizeof(double) * n); 
+
+  forall(uint64_t i=0; i<n; i++){
     arr[i] = i; 
   }
-  printf("par: %f , seq: %f\n" , l2(n, arr), l2_seq(n, arr));
+
+//  l2(n, arr);
+
+  clock_t before = clock();
+  double par = l2(n, arr);
+  clock_t after = clock(); 
+  double partime = (double)(after - before) / 1000000; 
+
+  before = clock();
+  double seq = l2_seq(n, arr);
+  after = clock(); 
+  double seqtime = (double)(after - before) / 1000000; 
+
+  printf("par: %f in %f s , seq: %f in %f s\n" , par, partime, seq, seqtime);
+  double bw = (double)((2ULL<<28) * sizeof(double)) / (1000000000.0 * partime);  
+  printf("par bandwidth: %f GB/s \n" , bw);
 }
 
diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp
index af302214d2545..12922e3e65aa8 100644
--- a/llvm/lib/Transforms/Tapir/GPUABI.cpp
+++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp
@@ -37,6 +37,13 @@ using namespace llvm;
 
 #define DEBUG_TYPE "gpuabi"
 
+// JIT compiler kernel at containing function entry, makes timing easier at the
+// cost of less laziness
+static cl::opt<bool>
+    JIT("jit-callsite", cl::init(false), cl::NotHidden,
+          cl::desc("Wait until parallel loop is called to jit kernel. " 
+          "(default=false)"));
+
 Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) {
   Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8);
 
@@ -231,11 +238,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
       // Grainsize argument is the third LC arg.
       Grainsize = &*++(++OutlineArgsIter);
   }
-  ThreadID = B.CreateMul(ThreadID, Grainsize);
-  Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize);
-  Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End);
-  Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End);
-  Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd);
+  Value *Cond = B.CreateICmpUGE(ThreadID, End);
 
   ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header,
                                                                  Cond));
@@ -245,10 +248,10 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
   // Update cloned loop condition to use the thread-end value.
   unsigned TripCountIdx = 0;
   ICmpInst *ClonedCond = cast<ICmpInst>(VMap[TL.getCondition()]);
-  if (ClonedCond->getOperand(0) != ThreadEnd)
+  if (ClonedCond->getOperand(0) != End)
     ++TripCountIdx;
-  ClonedCond->setOperand(TripCountIdx, ThreadEnd);
-  assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd &&
+  ClonedCond->setOperand(TripCountIdx, End);
+  assert(ClonedCond->getOperand(TripCountIdx) == End &&
          "End argument not used in condition");
 
 }
@@ -261,8 +264,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   Type *Int64Ty = Type::getInt64Ty(Ctx);
   Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
 
-  //Task *T = TL.getTask();
-  //Instruction *ReplCall = cast<CallBase>(TOI.ReplCall);
   LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM);
   Function *Parent = TOI.ReplCall->getFunction();
   Value *TripCount = OrderedInputs[0];
@@ -388,12 +389,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                                  GlobalValue::PrivateLinkage, LLVMBC,
                                  "gpu_" + Twine("kitsune_kernel"));
 
-  //Value* TripCount = isSRetInput(TOI.InputSet[0]) ? TOI.InputSet[1] : TOI.InputSet[0];
-  //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent,
-  //                                                        TOI.InputSet));
-  //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent,
-  //                                                            TOI.InputSet));
-
   Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID);
   Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy);
   Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo();
@@ -417,20 +412,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
     B.CreateStore(VoidVPtr, argPtr);
   }
 
-  Value *Grainsize = TL.getGrainsize() ?
-    ConstantInt::get(TripCount->getType(), TL.getGrainsize()) :
-    OrderedInputs[2];
-
-  //Type *Int64Ty = Type::getInt64Ty(LLVMM.getContext());
-  Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize);
-  Value *RunRem = B.CreateURem(TripCount, Grainsize);
-  Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0));
-  Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType());
-  Value *RunSize = B.CreateZExt(B.CreateAdd(RunSizeQ, IsRemAdd), Int64Ty);
-
-  Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0);
-  Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0);
-  Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, RunSize });
+  Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); 
+  Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); 
+  Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount });
   B.CreateCall(GPUWaitKernel, stream);
 
   LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M);
diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index fa1af8e829a31..422843ccf8e8e 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -1556,7 +1556,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     // Optimize this routine in the future.
     TI->recalculate(*F, *DT);
 
-  // accumulate reductions
+  // accumulate reductions in main loop
   const std::vector<BasicBlock*>& blocks = L->getBlocks(); 
   std::set<CallInst*> reductions;
   for (BasicBlock *BB : blocks){
@@ -1572,6 +1572,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
       }
     }
   }
+   
+  // accumulate reductions in epilog loop
   LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); 
 
   std::vector<std::tuple<CallInst*, Value* , Value*>> redMap; 
@@ -1588,10 +1590,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   Value *outerIters = RB.CreateUDiv(TripCount,
                                ConstantInt::get(TripCount->getType(), Count),
                                "stripiter");
+  auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); 
   for(CallInst* ci : reductions){
+    // TODO: generic allocation/free calls
     auto ptr = ci->getArgOperand(0); 
     auto ty = dyn_cast<PointerType>(ptr->getType())->getElementType(); 
-    auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction");
+    auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); 
+    auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); 
+    auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); 
+    //auto al = RB.CreateBitCast(rm, ty); 
+    //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction");
     IRBuilder<> BH(NewLoop->getHeader()->getTerminator()); 
     auto lptr = BH.CreateBitCast(
       BH.CreateGEP(ty, al, NewIdx), 
@@ -1605,13 +1613,17 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     //     red = reduce(red, body(i)); 
     //   }
     //   red = init; 
-    //   localred[m]; 
-    //   forall(k = ...){
+    //   localred[m+1]; 
+    //   
+    //   forall(k ∈ 0..m-1){
     //     localred[i] = body(j_0); 
-    //     for(j = j_1 ...)
+    //     for(j ∈ j_k_1..j_k_l-1)
     //       reduce(localred+i, body(j));
     //   }
-    //   for(k = ...)
+    //   for( j ∈ j_k_m .. n )
+    //     reduce(localred+m, body(j)); 
+    //   }
+    //   for(k ∈ 0..m) 
     //     reduce(&red, localred[k]); 
     //
     ptr->replaceUsesWithIf(lptr, [L](Use &u){
@@ -1688,25 +1700,34 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
         BB.CreateGEP(ty, al, Idx), 
         ptr->getType());                             
       auto x = BB.CreateLoad(ty, lptr); 
+      BB.SetCurrentDebugLocation(ci->getDebugLoc()); 
       BB.CreateCall(ci->getCalledFunction(), { ptr, x }); 
     }
     Value *IdxAdd =
         BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1),
                           Idx->getName() + ".add");
-    Idx->addIncoming(IdxAdd, bodyTerm->getParent()); 
+    BasicBlock* body = bodyTerm->getParent(); 
+    BasicBlock* loopExit = exitTerm->getParent(); 
+    Idx->addIncoming(IdxAdd, body); 
     ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); 
+
+    // Update Loopinfo with reduction loop
+    Loop* RL = LI->AllocateLoop(); 
+    if(ParentLoop) ParentLoop->addChildLoop(RL); 
+    else LI->addTopLevelLoop(RL); 
+    RL->addBasicBlockToLoop(RedEpiHeader, *LI); 
+    RL->addBasicBlockToLoop(body, *LI); 
   }
 
   LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F);  
 
   // TODO: fix DT updates
   DT->recalculate(*F); 
-  /*
+
 #ifndef NDEBUG
   DT->verify();
   LI->verify(*DT);
 #endif
-  */
 
   return NewLoop;
 }

From 6a40583b15349ed7afe0ec94564061f6129f4407 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Thu, 21 Sep 2023 09:22:56 -0600
Subject: [PATCH 08/14] Fixed reductions use of pointer types

---
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index 422843ccf8e8e..945dc16f89000 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -1594,7 +1594,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   for(CallInst* ci : reductions){
     // TODO: generic allocation/free calls
     auto ptr = ci->getArgOperand(0); 
-    auto ty = dyn_cast<PointerType>(ptr->getType())->getElementType(); 
+    auto ty = dyn_cast<PointerType>(ptr->getType())->getArrayElementType(); 
     auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); 
     auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); 
     auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); 
@@ -1653,7 +1653,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   if(!reductions.empty()){
     // Peel the first iteration of the loop and replace the reduction calls in
     // the peeled code with stores
-    peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA); 
+    ValueToValueMapTy VMap; 
+    peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); 
     SmallVector<Instruction*, 4> cis;
     for(auto &BB : NewLoop->blocks()){
       if(!L->contains(BB)){ // better way?
@@ -1695,7 +1696,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     // reduce them. 
     for(auto& kv : redMap){
       const auto [ ci, ptr, al ] = kv; 
-      auto ty = dyn_cast<PointerType>(ptr->getType())->getElementType(); 
+      auto ty = dyn_cast<PointerType>(ptr->getType())->getArrayElementType(); 
       auto lptr = BB.CreateBitCast(
         BB.CreateGEP(ty, al, Idx), 
         ptr->getType());                             

From f5330b43ff1d8f987e6d170b2bb867243f9e386d Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Thu, 12 Sep 2024 14:35:33 -0600
Subject: [PATCH 09/14] working reductions on 16.x

---
 llvm/include/llvm/Bitcode/LLVMBitCodes.h    |  2 ++
 llvm/include/llvm/IR/GlobalValue.h          |  1 -
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp   |  2 ++
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp   |  2 ++
 llvm/lib/Transforms/Tapir/GPUABI.cpp        |  3 ++-
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 19 ++++++++++---------
 llvm/lib/Transforms/Tapir/OpenCilkABI.cpp   |  2 +-
 7 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 40ae85cf16919..fa5bdd2c5cc4b 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -764,6 +764,8 @@ enum AttributeKindCodes {
   ATTR_KIND_HYBRID_PATCHABLE = 95,
   ATTR_KIND_SANITIZE_CILK = 96,
   ATTR_KIND_STEALABLE = 97,
+  ATTR_KIND_SANITIZE_CILK = 98,
+  ATTR_KIND_KITSUNE_REDUCTION = 99,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index 59ba8919d8605..d98c0c5f4dfd0 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -571,7 +571,6 @@ class GlobalValue : public Constant {
 
   // FIXME: need a better solution but also too difficult to re-outline 
   // everything w/ tapir for GPU targets (AMD in particular)... 
-  //#warning "FIXME: need better solution than mutateType()."
   void mutateValueType(Type *Ty) {
     ValueType = Ty;
   }
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 92411913c43ee..09306d88ae805 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2189,6 +2189,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
     return Attribute::Range;
   case bitc::ATTR_KIND_INITIALIZES:
     return Attribute::Initializes;
+  case bitc::ATTR_KIND_KITSUNE_REDUCTION:
+    return Attribute::KitsuneReduction;
   }
 }
 
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 85f9d1556c26c..5251549c36f2d 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -889,6 +889,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
     return bitc::ATTR_KIND_RANGE;
   case Attribute::Initializes:
     return bitc::ATTR_KIND_INITIALIZES;
+  case Attribute::KitsuneReduction:
+    return bitc::ATTR_KIND_KITSUNE_REDUCTION; 
   case Attribute::EndAttrKinds:
     llvm_unreachable("Can not encode end-attribute kinds marker.");
   case Attribute::None:
diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp
index 12922e3e65aa8..03b7309023594 100644
--- a/llvm/lib/Transforms/Tapir/GPUABI.cpp
+++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp
@@ -305,7 +305,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                   SmallVector<ReturnInst*, 8> Returns;
                   CloneFunctionInto(deviceF, f, VMap, CloneFunctionChangeType::DifferentModule, Returns);
                   // GPU calls are slow as balls, try to force inlining
-                  deviceF->addFnAttr(Attribute::AlwaysInline);
+                  if(!deviceF->hasFnAttribute(Attribute::NoInline))
+                    deviceF->addFnAttr(Attribute::AlwaysInline); 
                   todo.push_back(deviceF);
                 }
               }
diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index 945dc16f89000..0d12868b37c60 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -1558,7 +1558,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
 
   // accumulate reductions in main loop
   const std::vector<BasicBlock*>& blocks = L->getBlocks(); 
-  std::set<CallInst*> reductions;
+  std::set<std::pair<CallInst*, Type*>> reductions;
   for (BasicBlock *BB : blocks){
     for (Instruction &I : *BB) {
       if(auto ci = dyn_cast<CallInst>(&I)){
@@ -1566,7 +1566,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
         if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
           LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << 
                                "with reduction function: " << f->getName() << "\n"); 
-          reductions.insert(ci); 
+          auto ty = ci->getArgOperand(1)->getType(); 
+          reductions.insert(std::make_pair(ci, ty)); 
           //TODO: check the type to confirm valid reduction
         }
       }
@@ -1576,7 +1577,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   // accumulate reductions in epilog loop
   LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); 
 
-  std::vector<std::tuple<CallInst*, Value* , Value*>> redMap; 
+  std::vector<std::tuple<CallInst*, Value* , Value*, Type*>> redMap; 
   // TODO: Modify the strip mining outer loop to be smaller: currently we are
   // stack allocating n/2048 reduction values.
   // TODO: Initialize local reductions with unit values
@@ -1591,12 +1592,13 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
                                ConstantInt::get(TripCount->getType(), Count),
                                "stripiter");
   auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); 
-  for(CallInst* ci : reductions){
+  for(auto &pair : reductions){
     // TODO: generic allocation/free calls
+    auto ci = pair.first; 
     auto ptr = ci->getArgOperand(0); 
-    auto ty = dyn_cast<PointerType>(ptr->getType())->getArrayElementType(); 
+    auto ty = pair.second; 
     auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); 
-    auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); 
+    auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty))); 
     auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); 
     //auto al = RB.CreateBitCast(rm, ty); 
     //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction");
@@ -1604,7 +1606,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     auto lptr = BH.CreateBitCast(
       BH.CreateGEP(ty, al, NewIdx), 
       ptr->getType());                             
-    redMap.push_back(std::make_tuple(ci, ptr, al)); 
+    redMap.push_back(std::make_tuple(ci, ptr, al, ty)); 
     // Assume there is more than one element, and
     // use the first element for the first iteration of the loop.
     // roughly: 
@@ -1695,8 +1697,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
     // For each reduction, get the allocated thread local reduced values and
     // reduce them. 
     for(auto& kv : redMap){
-      const auto [ ci, ptr, al ] = kv; 
-      auto ty = dyn_cast<PointerType>(ptr->getType())->getArrayElementType(); 
+      const auto [ ci, ptr, al, ty ] = kv; 
       auto lptr = BB.CreateBitCast(
         BB.CreateGEP(ty, al, Idx), 
         ptr->getType());                             
diff --git a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
index 98339299f009d..be51e49d0d5d7 100644
--- a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
+++ b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp
@@ -173,7 +173,7 @@ void OpenCilkABI::prepareModule() {
     // Parse the bitcode file.  This call imports structure definitions, but not
     // function definitions.
     if (std::unique_ptr<Module> ExternalModule =
-        parseIRFile(path.getValue(), SMD, C)) {
+        parseIRFile(RuntimeBCPath, SMD, C)) {
       // Get the original DiagnosticHandler for this context.
       std::unique_ptr<DiagnosticHandler> OrigDiagHandler =
           C.getDiagnosticHandler();

From 0c73a4158b07add4531ff38a3bd74bff25e9c4d5 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Mon, 16 Sep 2024 12:42:34 -0600
Subject: [PATCH 10/14] added reductions example makefile

---
 kitsune-tests/reductions/makefile | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 kitsune-tests/reductions/makefile

diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile
new file mode 100644
index 0000000000000..78e4a2a8e81cb
--- /dev/null
+++ b/kitsune-tests/reductions/makefile
@@ -0,0 +1,25 @@
+all: l2 l2_gpu l2_serial
+
+l2.ll : l2.c
+	clang -S -emit-llvm l2.c -O1 -ftapir=none 
+
+l2_stripmined.ll: l2.ll
+	opt -S -passes="loop-stripmine" -o l2_stripmined.ll l2.ll 
+
+l2_stripmined_opt.ll: l2_stripmined.ll
+	opt -S -O2 -o l2_stripmined_opt.ll l2_stripmined.ll
+
+l2: l2_stripmined_opt.ll
+	clang -ftapir=opencilk -O2 l2_stripmined_opt.ll -o l2 -lm  -lllvm-gpu
+
+l2_gpu.ll: l2_stripmined_opt.ll
+	clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@ 
+
+l2_gpu: l2_gpu.ll
+	clang l2_gpu.ll -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu
+
+l2_serial: l2.ll
+	clang l2.ll -o l2_serial -ftapir=serial -lllvm-gpu -lm
+
+clean: 
+	rm -f l2_stripmined.ll l2 l2.ll

From 39204642fc55eb2dc50cfd553960f157a1069b94 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Thu, 31 Oct 2024 09:23:38 -0600
Subject: [PATCH 11/14] Added openmp reduce for comparison

---
 kitsune-tests/reductions/l2.c        | 13 +++++--
 kitsune-tests/reductions/l2_openmp.c | 55 ++++++++++++++++++++++++++++
 kitsune-tests/reductions/makefile    |  5 ++-
 3 files changed, 68 insertions(+), 5 deletions(-)
 create mode 100644 kitsune-tests/reductions/l2_openmp.c

diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c
index e39b8765da54d..c8e936fb98b48 100644
--- a/kitsune-tests/reductions/l2.c
+++ b/kitsune-tests/reductions/l2.c
@@ -28,17 +28,22 @@ double l2_seq(uint64_t n, double* a){
 }
 
 int main(int argc, char** argv){
-  uint64_t n = argc > 1 ? atoi(argv[1]) : 2ULL<<28 ; 
+  int e = argc > 1 ? atoi(argv[1]) : 28; 
+  int niter = argc > 2 ? atoi(argv[2]) : 100; 
+  uint64_t n = 1ULL<<e; 
   double* arr = (double*)gpuManagedMalloc(sizeof(double) * n); 
 
   forall(uint64_t i=0; i<n; i++){
     arr[i] = i; 
   }
 
-//  l2(n, arr);
+  l2(n, arr);
 
   clock_t before = clock();
-  double par = l2(n, arr);
+  double par; 
+  for(int i=0; i<niter; i++){
+    par = l2(n, arr);
+  }
   clock_t after = clock(); 
   double partime = (double)(after - before) / 1000000; 
 
@@ -48,7 +53,7 @@ int main(int argc, char** argv){
   double seqtime = (double)(after - before) / 1000000; 
 
   printf("par: %f in %f s , seq: %f in %f s\n" , par, partime, seq, seqtime);
-  double bw = (double)((2ULL<<28) * sizeof(double)) / (1000000000.0 * partime);  
+  double bw = (double)((1ULL<<e) * niter * sizeof(double)) / (1000000000.0 * partime);  
   printf("par bandwidth: %f GB/s \n" , bw);
 }
 
diff --git a/kitsune-tests/reductions/l2_openmp.c b/kitsune-tests/reductions/l2_openmp.c
new file mode 100644
index 0000000000000..836052b1f5579
--- /dev/null
+++ b/kitsune-tests/reductions/l2_openmp.c
@@ -0,0 +1,55 @@
+#include<time.h>
+#include<math.h>
+#include<stdio.h>
+#include<stdlib.h>
+#include<gpu.h>
+
+double l2(uint64_t n, double* a){
+  double red = 0; 
+  #pragma omp parallel for reduction(+:red)
+  for(uint64_t i=0; i<n; i++){
+    red += a[i]*a[i]; 
+  }
+
+  return sqrt(red);
+}
+
+double l2_seq(uint64_t n, double* a){
+  double red = 0; 
+  for(uint64_t i=0; i < n; i++){
+    red += a[i]*a[i]; 
+  }
+  return sqrt(red);
+}
+
+int main(int argc, char** argv){
+  int e = argc > 1 ? atoi(argv[1]) : 28; 
+  int niter = argc > 2 ? atoi(argv[2]) : 100; 
+  uint64_t n = 1ULL<<e; 
+  double* arr = malloc(sizeof(double) * n); 
+
+  #pragma omp parallel for
+  for(uint64_t i=0; i<n; i++){
+    arr[i] = i; 
+  }
+
+  l2(n, arr);
+
+  clock_t before = clock();
+  double par; 
+  for(int i=0; i<niter; i++){
+    par = l2(n, arr);
+  }
+  clock_t after = clock(); 
+  double partime = (double)(after - before) / 1000000; 
+
+  before = clock();
+  double seq = l2_seq(n, arr);
+  after = clock(); 
+  double seqtime = (double)(after - before) / 1000000; 
+
+  printf("par: %f in %f s , seq: %f in %f s\n" , par, partime, seq, seqtime);
+  double bw = (double)((1ULL<<e) * niter * sizeof(double)) / (1000000000.0 * partime);  
+  printf("par bandwidth: %f GB/s \n" , bw);
+}
+
diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile
index 78e4a2a8e81cb..d3d264acb781b 100644
--- a/kitsune-tests/reductions/makefile
+++ b/kitsune-tests/reductions/makefile
@@ -1,4 +1,4 @@
-all: l2 l2_gpu l2_serial
+all: l2 l2_gpu l2_serial l2_openmp
 
 l2.ll : l2.c
 	clang -S -emit-llvm l2.c -O1 -ftapir=none 
@@ -21,5 +21,8 @@ l2_gpu: l2_gpu.ll
 l2_serial: l2.ll
 	clang l2.ll -o l2_serial -ftapir=serial -lllvm-gpu -lm
 
+l2_openmp: l2_openmp.c
+	clang l2_openmp.c -fopenmp -O2 -o l2_openmp -lm
+
 clean: 
 	rm -f l2_stripmined.ll l2 l2.ll

From 1877b2c3aee7febd5ceae9026ad91048432f0821 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@gmail.com>
Date: Wed, 6 Nov 2024 16:32:16 -0700
Subject: [PATCH 12/14] mostly working grid stride gpu reductions

---
 kitsune-tests/reductions/l2.c                 |  20 +--
 kitsune-tests/reductions/l2_openmp.c          |  43 ++++--
 kitsune-tests/reductions/makefile             |   6 +-
 llvm/include/llvm/Transforms/Tapir/GPUABI.h   |   5 +-
 .../llvm/Transforms/Tapir/LoweringUtils.h     |   3 +
 llvm/lib/Transforms/Tapir/GPUABI.cpp          | 141 +++++++++++++++---
 llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp  |  18 ++-
 7 files changed, 176 insertions(+), 60 deletions(-)

diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c
index c8e936fb98b48..1da6e4b2968f3 100644
--- a/kitsune-tests/reductions/l2.c
+++ b/kitsune-tests/reductions/l2.c
@@ -10,6 +10,7 @@ void sum(double *a, double b){
   *a += b;
 }
 
+__attribute__((noinline))
 double l2(uint64_t n, double* a){
   double red = 0; 
   forall(uint64_t i=0; i<n; i++){
@@ -19,14 +20,6 @@ double l2(uint64_t n, double* a){
   return sqrt(red);
 }
 
-double l2_seq(uint64_t n, double* a){
-  double red = 0; 
-  for(uint64_t i=0; i < n; i++){
-    sum(&red, a[i]*a[i]); 
-  }
-  return sqrt(red);
-}
-
 int main(int argc, char** argv){
   int e = argc > 1 ? atoi(argv[1]) : 28; 
   int niter = argc > 2 ? atoi(argv[2]) : 100; 
@@ -37,8 +30,9 @@ int main(int argc, char** argv){
     arr[i] = i; 
   }
 
-  l2(n, arr);
+  printf("par:%f \n", l2(n, arr));
 
+  /*
   clock_t before = clock();
   double par; 
   for(int i=0; i<niter; i++){
@@ -47,13 +41,9 @@ int main(int argc, char** argv){
   clock_t after = clock(); 
   double partime = (double)(after - before) / 1000000; 
 
-  before = clock();
-  double seq = l2_seq(n, arr);
-  after = clock(); 
-  double seqtime = (double)(after - before) / 1000000; 
-
-  printf("par: %f in %f s , seq: %f in %f s\n" , par, partime, seq, seqtime);
+  printf("par: %f in %f s \n" , par, partime);
   double bw = (double)((1ULL<<e) * niter * sizeof(double)) / (1000000000.0 * partime);  
   printf("par bandwidth: %f GB/s \n" , bw);
+  */
 }
 
diff --git a/kitsune-tests/reductions/l2_openmp.c b/kitsune-tests/reductions/l2_openmp.c
index 836052b1f5579..70f83a4cd3ec4 100644
--- a/kitsune-tests/reductions/l2_openmp.c
+++ b/kitsune-tests/reductions/l2_openmp.c
@@ -3,6 +3,7 @@
 #include<stdio.h>
 #include<stdlib.h>
 #include<gpu.h>
+#include<omp.h>
 
 double l2(uint64_t n, double* a){
   double red = 0; 
@@ -14,12 +15,19 @@ double l2(uint64_t n, double* a){
   return sqrt(red);
 }
 
-double l2_seq(uint64_t n, double* a){
-  double red = 0; 
-  for(uint64_t i=0; i < n; i++){
-    red += a[i]*a[i]; 
+double l2_grid(uint64_t n, double* a){
+  double red[12];
+  #pragma omp parallel for
+  for(uint64_t i=0; i<12; i++){
+    red[i] = 0; 
+    for(uint64_t j=i; j<n; j+=12){
+      red[i] += a[j]*a[j]; 
+    }
   }
-  return sqrt(red);
+
+  double res = 0; 
+  for(int i=0; i<12; i++) res += red[i]; 
+  return sqrt(res); 
 }
 
 int main(int argc, char** argv){
@@ -35,21 +43,26 @@ int main(int argc, char** argv){
 
   l2(n, arr);
 
-  clock_t before = clock();
+  double before = omp_get_wtime(); 
   double par; 
   for(int i=0; i<niter; i++){
     par = l2(n, arr);
   }
-  clock_t after = clock(); 
-  double partime = (double)(after - before) / 1000000; 
-
-  before = clock();
-  double seq = l2_seq(n, arr);
-  after = clock(); 
-  double seqtime = (double)(after - before) / 1000000; 
-
-  printf("par: %f in %f s , seq: %f in %f s\n" , par, partime, seq, seqtime);
+  double after = omp_get_wtime(); 
+  double partime = (double)(after - before); 
+  printf("par: %f in %f s\n" , par, partime);
   double bw = (double)((1ULL<<e) * niter * sizeof(double)) / (1000000000.0 * partime);  
   printf("par bandwidth: %f GB/s \n" , bw);
+
+  double before_grid = omp_get_wtime();
+  double grid; 
+  for(int i=0; i<niter; i++){
+    grid = l2_grid(n, arr);
+  }
+  double after_grid = omp_get_wtime(); 
+  double partime_grid = (double)(after_grid - before_grid); 
+  printf("grid: %f in %f s\n" , grid, partime_grid);
+  double bw_grid = (double)((1ULL<<e) * niter * sizeof(double)) / (1000000000.0 * partime_grid);  
+  printf("grid bandwidth: %f GB/s \n" , bw_grid);
 }
 
diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile
index d3d264acb781b..9f4bd3b1e478c 100644
--- a/kitsune-tests/reductions/makefile
+++ b/kitsune-tests/reductions/makefile
@@ -1,4 +1,4 @@
-all: l2 l2_gpu l2_serial l2_openmp
+all: l2_gpu l2 l2_serial l2_openmp
 
 l2.ll : l2.c
 	clang -S -emit-llvm l2.c -O1 -ftapir=none 
@@ -12,8 +12,8 @@ l2_stripmined_opt.ll: l2_stripmined.ll
 l2: l2_stripmined_opt.ll
 	clang -ftapir=opencilk -O2 l2_stripmined_opt.ll -o l2 -lm  -lllvm-gpu
 
-l2_gpu.ll: l2_stripmined_opt.ll
-	clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@ 
+l2_gpu.ll: l2.ll
+	clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@  -mllvm -debug-only=gpuabi
 
 l2_gpu: l2_gpu.ll
 	clang l2_gpu.ll -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu
diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h
index 63bdd19cf6e63..adb5c95cf190a 100644
--- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h
+++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h
@@ -67,6 +67,8 @@ class LLVMLoop : public LoopOutlineProcessor {
   FunctionCallee GPUInit = nullptr;
   FunctionCallee GPULaunchKernel = nullptr;
   FunctionCallee GPUWaitKernel = nullptr;
+  FunctionCallee GPUManagedMalloc = nullptr; 
+  FunctionCallee GPUGridSize = nullptr; 
 
   SmallVector<Value *, 5> OrderedInputs; 
 public:
@@ -86,8 +88,9 @@ class LLVMLoop : public LoopOutlineProcessor {
                           ValueToValueMapTy &VMap) override final;
   void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                                DominatorTree &DT) override final;
+  void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet& LoopInputs) override final;
 };
-}
+} //namespace llvm
 
 #endif
 /*
diff --git a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h
index e2e65b4000fed..99a0a7029c32b 100644
--- a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h
+++ b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h
@@ -449,6 +449,9 @@ class LoopOutlineProcessor {
   virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap)
   { /* no-op */ }
 
+  virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet &LoopInputs)
+  { preProcessTapirLoop(TL, VMap); }
+
   /// Processes an outlined Function Helper for a Tapir loop, just after the
   /// function has been outlined.
   virtual void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp
index 03b7309023594..1d1fd763343f4 100644
--- a/llvm/lib/Transforms/Tapir/GPUABI.cpp
+++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp
@@ -45,8 +45,10 @@ static cl::opt<bool>
           "(default=false)"));
 
 Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) {
-  Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8);
-
+  //Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8);
+  Module *M = GrainsizeCall->getModule(); 
+  Type *Int64Ty = Type::getInt64Ty(M->getContext());
+  Value* Grainsize = CallInst::Create(M->getOrInsertFunction("gpuGridSize", Int64Ty), {},  GrainsizeCall); 
   // Replace uses of grainsize intrinsic call with this grainsize value.
   GrainsizeCall->replaceAllUsesWith(Grainsize);
   return Grainsize;
@@ -116,6 +118,8 @@ LLVMLoop::LLVMLoop(Module &M)
   GPUInit = M.getOrInsertFunction("initRuntime", VoidTy);
   GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty);
   GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy, VoidPtrTy);
+  GPUManagedMalloc = M.getOrInsertFunction("gpuManagedMalloc", VoidPtrTy, Int64Ty);  
+  GPUGridSize = M.getOrInsertFunction("gpuGridSize", Int64Ty);  
 }
 
 void LLVMLoop::setupLoopOutlineArgs(
@@ -196,6 +200,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
   BasicBlock *Header = cast<BasicBlock>(VMap[L->getHeader()]);
   BasicBlock *Exit = cast<BasicBlock>(VMap[TL.getExitBlock()]);
   PHINode *PrimaryIV = cast<PHINode>(VMap[TL.getPrimaryInduction().first]);
+  InductionDescriptor ID = TL.getPrimaryInduction().second; 
   Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry);
   Instruction *ClonedSyncReg = cast<Instruction>(
       VMap[T->getDetach()->getSyncRegion()]);
@@ -253,14 +258,22 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
   ClonedCond->setOperand(TripCountIdx, End);
   assert(ClonedCond->getOperand(TripCountIdx) == End &&
          "End argument not used in condition");
-
+  // we replace the step
+  assert(TL.getPrimaryInduction().second.getInductionBinOp()->getOpcode() == Instruction::BinaryOps::Add &&
+        "Only support gpu kernels with addition for induction step"); 
+  ConstantInt *CStep = cast<ConstantInt>(VMap[ID.getConstIntStepValue()]);  
+  BinaryOperator *BinOp = cast<BinaryOperator>(VMap[ID.getInductionBinOp()]); 
+  assert(CStep && BinOp && "Couldn't infer step or operator from primary induction gpu variable"); 
+  IRBuilder<> IncB(&BinOp->getParent()->front());  
+  Value* NewStep = IncB.CreateMul(CStep, Grainsize); 
+  BinOp->setOperand(1, NewStep); 
+  ClonedCond->setPredicate(ICmpInst::Predicate::ICMP_UGE); 
 }
 
 void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                                       DominatorTree &DT) {
   LLVMContext &Ctx = M.getContext();
   Type *Int8Ty = Type::getInt8Ty(Ctx);
-  Type *Int32Ty = Type::getInt32Ty(Ctx);
   Type *Int64Ty = Type::getInt64Ty(Ctx);
   Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
 
@@ -273,12 +286,7 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
 
   IRBuilder<> B(&NBB->front());
 
-  // Compile the kernel
-  //LLVMM.getFunctionList().remove(TOI.Outline);
-  //TOI.Outline->eraseFromParent();
-  LLVMContext &LLVMCtx = LLVMM.getContext();
-
-  ValueToValueMapTy VMap;
+  ValueToValueMapTy VMap; 
   // We recursively add definitions and declarations to the device module
   SmallVector<Function*> todo;
   todo.push_back(LLVMM.getFunction("kitsune_kernel"));
@@ -390,14 +398,10 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                                  GlobalValue::PrivateLinkage, LLVMBC,
                                  "gpu_" + Twine("kitsune_kernel"));
 
-  Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID);
-  Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy);
-  Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo();
-
-  Constant *kernelSize = ConstantInt::get(Int64Ty,
-    LLVMGlobal->getInitializer()->getType()->getArrayNumElements());
-  BasicBlock &EBB = Parent->getEntryBlock();
-  IRBuilder<> EB(&EBB.front());
+  Constant *kernelSize = ConstantInt::get(Int64Ty, 
+    LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); 
+  BasicBlock &EBB = Parent->getEntryBlock(); 
+  IRBuilder<> EB(&EBB.front()); 
   EB.CreateCall(GPUInit, {});
 
   ArrayType* arrayType = ArrayType::get(VoidPtrTy, OrderedInputs.size());
@@ -421,4 +425,103 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M);
 }
 
-#endif
+void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet &LoopInputs) {
+  Loop *L = TL.getLoop();  
+  BasicBlock *PH = L->getLoopPreheader(); 
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+  unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+  BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
+  DetachInst *DI = cast<DetachInst>(Header->getTerminator());
+  Value *SyncReg = DI->getSyncRegion();
+  Value *GS; 
+
+  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+  // accumulate Reductions in main loop
+  const std::vector<BasicBlock*>& Blocks = L->getBlocks(); 
+  std::set<std::pair<CallInst*, Type*>> Reductions;
+  for (BasicBlock *BB : Blocks){
+    for (Instruction &I : *BB) {
+      if(auto *CI = dyn_cast<CallInst>(&I)){
+        auto *F = CI->getCalledFunction(); 
+        if(F->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+          LLVM_DEBUG(dbgs() << "Found reduction var: " << CI->getArgOperand(0)->getName() << 
+                               "with reduction function: " << F->getName() << "\n"); 
+          auto *Ty = CI->getArgOperand(1)->getType(); 
+          Reductions.insert(std::make_pair(CI, Ty)); 
+          //TODO: check the type to confirm valid reduction
+        }
+      }
+    }
+  }
+
+  std::vector<std::tuple<CallInst*, Value* , Value*, Type*>> RedMap; 
+
+  for(auto &Red : Reductions){
+    // We assume there's a preheader, and use it to insert allocations for reductions:
+    // preheader: 
+    //   gs = gridSize()
+    //   reds = gpuManagedMalloc(gs)
+    //   br header
+    IRBuilder<> RB(&PH->front()); 
+    CallInst *CI = Red.first; 
+    Value *Ptr = CI->getArgOperand(0); 
+    Type *Ty = Red.second; 
+    GS = RB.CreateCall(GPUGridSize); 
+    Value *NBytes = RB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); 
+    CallInst *Alloc = RB.CreateCall(GPUManagedMalloc, {NBytes}); 
+    LoopInputs.insert(Alloc); 
+    // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing
+    CI->setOperand(0, Alloc); 
+
+    RedMap.push_back(std::make_tuple(CI, Ptr, Alloc, Ty)); 
+  }
+
+  if(!Reductions.empty()){
+    Instruction* Term = LatchExit->getTerminator(); 
+    BasicBlock *PostSync = Term->getSuccessor(0);
+    BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); 
+    RedEpiHeader->moveAfter(LatchExit); 
+    ReplaceInstWithInst(Term, SyncInst::Create(RedEpiHeader, SyncReg)); 
+    BranchInst::Create(PostSync, RedEpiHeader);
+    PHINode *Idx = PHINode::Create(GS->getType(), 2,
+                                   "reductionepilogueidx",
+                                   RedEpiHeader->getFirstNonPHI());
+    IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); 
+    Idx->addIncoming(ConstantInt::get(GS->getType(), 0), LatchExit);
+    Instruction *BodyTerm, *ExitTerm;
+    Value *Cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, GS); 
+    SplitBlockAndInsertIfThenElse(Cmp, RedEpiHeader->getTerminator(), &BodyTerm, &ExitTerm);
+
+    IRBuilder<> BB(BodyTerm); 
+    // For each reduction, get the allocated thread local reduced values and
+    // reduce them. 
+    for(auto& KV : RedMap){
+      const auto [ CI, Ptr, Al, Ty ] = KV; 
+      Value* Lptr = BB.CreateBitCast(
+        BB.CreateGEP(Ty, Al, Idx), 
+        Ptr->getType());                             
+      Value *LR = BB.CreateLoad(Ty, Lptr); 
+      BB.SetCurrentDebugLocation(CI->getDebugLoc()); 
+      BB.CreateCall(CI->getCalledFunction(), { Ptr, LR }); 
+    }
+    Value *IdxAdd =
+        BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1),
+                          Idx->getName() + ".add");
+    BasicBlock* Body = BodyTerm->getParent(); 
+    BasicBlock* LoopExit = ExitTerm->getParent(); 
+    Idx->addIncoming(IdxAdd, Body); 
+    ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); 
+
+    // Update Loopinfo with reduction loop
+    //Loop* RL = LI->AllocateLoop(); 
+    //if(ParentLoop) ParentLoop->addChildLoop(RL); 
+    //else LI->addTopLevelLoop(RL); 
+    //RL->addBasicBlockToLoop(RedEpiHeader, *LI); 
+    //RL->addBasicBlockToLoop(body, *LI); 
+  }
+  
+  LLVM_DEBUG(dbgs() << "Finished preProcessTapirLoop: " << *PH->getParent());
+}
diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
index d98fdad17043a..ff5321ca23699 100644
--- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
@@ -1535,11 +1535,14 @@ Function *LoopSpawningImpl::createHelperForTapirLoop(
 
 /// Outline all recorded Tapir loops in the function.
 TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
+
   // Prepare Tapir loops for outlining.
   for (Task *T : post_order(TI.getRootTask())) {
     if (TapirLoopInfo *TL = getTapirLoop(T)) {
+
       PredicatedScalarEvolution PSE(SE, *TL->getLoop());
-      bool CanOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME,
+
+      bool canOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME,
                                                 ORE, TTI);
       if (!CanOutline) {
         const Loop *L = TL->getLoop();
@@ -1552,6 +1555,7 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       // Get an outline processor for each Tapir loop.
       OutlineProcessors[TL] =
         std::unique_ptr<LoopOutlineProcessor>(getOutlineProcessor(TL));
+
     }
   }
 
@@ -1589,12 +1593,17 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
     } // end timed region
 
     TapirLoopInfo *TL = getTapirLoop(T);
+
     if (!TL)
       continue;
 
     Loop *L = TL->getLoop();
     LLVM_DEBUG(dbgs() << "Outlining Tapir " << *L << "\n");
 
+    ValueToValueMapTy VMap; 
+    // Run a pre-processing step before we create the helper function.
+    OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap, LoopInputSets[L]);
+
     // Convert the inputs of the Tapir loop to inputs to the helper.
     ValueSet TLInputsFixed;
     ValueToValueMapTy InputMap;
@@ -1611,7 +1620,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
                           OutlineProcessors[TL]->getArgStructMode(), InputMap,
                           L);
     } // end timed region
-
     ValueSet HelperArgs;
     SmallVector<Value *, 8> HelperInputs;
     {
@@ -1619,6 +1627,7 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
                          "Setup inputs to Tapir-loop helper function",
                          TimerGroupName, TimerGroupDescription,
                          TimePassesIsEnabled);
+
     OutlineProcessors[TL]->setupLoopOutlineArgs(
         F, HelperArgs, HelperInputs, LoopInputSets[L], LoopCtlArgs[L],
         LoopCtlInputs[L], TLInputsFixed);
@@ -1638,11 +1647,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       LoopInputs[L].push_back(V);
     LoopArgStarts[L] = ArgStart;
 
-    ValueToValueMapTy VMap;
-
-    // Run a pre-processing step before we create the helper function.
-    OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap);
-
     // Create the helper function.
     Function *Outline = createHelperForTapirLoop(
         TL, LoopArgs[L], OutlineProcessors[TL]->getIVArgIndex(F, LoopArgs[L]),

From c85f06d8a66d62d60535c4f63d08abe54a300122 Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@lanl.gov>
Date: Thu, 7 Nov 2024 21:34:21 -0700
Subject: [PATCH 13/14] working high performance gpu reductions

---
 kitsune-tests/reductions/l2.c                |  9 +--
 kitsune-tests/reductions/makefile            | 11 +--
 llvm/include/llvm/Transforms/Tapir/GPUABI.h  |  3 +-
 llvm/lib/Transforms/Tapir/GPUABI.cpp         | 73 ++++++++++++++++----
 llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp | 13 ++--
 5 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c
index 1da6e4b2968f3..74fc51ec8337e 100644
--- a/kitsune-tests/reductions/l2.c
+++ b/kitsune-tests/reductions/l2.c
@@ -14,10 +14,11 @@ __attribute__((noinline))
 double l2(uint64_t n, double* a){
   double red = 0; 
   forall(uint64_t i=0; i<n; i++){
-    sum(&red, a[i]*a[i]); 
+    sum(&red, a[i]); 
   }
 
-  return sqrt(red);
+  return red; 
+  //return sqrt(red);
 }
 
 int main(int argc, char** argv){
@@ -27,10 +28,10 @@ int main(int argc, char** argv){
   double* arr = (double*)gpuManagedMalloc(sizeof(double) * n); 
 
   forall(uint64_t i=0; i<n; i++){
-    arr[i] = i; 
+    arr[i] = 1; 
   }
 
-  printf("par:%f \n", l2(n, arr));
+  printf("result: %f \n", l2(n, arr));
 
   /*
   clock_t before = clock();
diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile
index 9f4bd3b1e478c..55e97f971e773 100644
--- a/kitsune-tests/reductions/makefile
+++ b/kitsune-tests/reductions/makefile
@@ -1,4 +1,5 @@
-all: l2_gpu l2 l2_serial l2_openmp
+exes=l2_gpu l2 l2_serial l2_openmp
+all:${exes}
 
 l2.ll : l2.c
 	clang -S -emit-llvm l2.c -O1 -ftapir=none 
@@ -10,13 +11,13 @@ l2_stripmined_opt.ll: l2_stripmined.ll
 	opt -S -O2 -o l2_stripmined_opt.ll l2_stripmined.ll
 
 l2: l2_stripmined_opt.ll
-	clang -ftapir=opencilk -O2 l2_stripmined_opt.ll -o l2 -lm  -lllvm-gpu
+	clang -ftapir=opencilk -O2 l2_stripmined_opt.ll -o l2 -lm  -lllvm-gpu -lpthread
 
 l2_gpu.ll: l2.ll
-	clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@  -mllvm -debug-only=gpuabi
+	clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@ -mllvm --debug-only=gpuabi
 
 l2_gpu: l2_gpu.ll
-	clang l2_gpu.ll -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu
+	clang l2_gpu.ll -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu -lpthread
 
 l2_serial: l2.ll
 	clang l2.ll -o l2_serial -ftapir=serial -lllvm-gpu -lm
@@ -25,4 +26,4 @@ l2_openmp: l2_openmp.c
 	clang l2_openmp.c -fopenmp -O2 -o l2_openmp -lm
 
 clean: 
-	rm -f l2_stripmined.ll l2 l2.ll
+	rm -f ${exes} l2.ll
diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h
index adb5c95cf190a..112e10780a98e 100644
--- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h
+++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h
@@ -60,7 +60,6 @@ class LLVMLoop : public LoopOutlineProcessor {
   static unsigned NextKernelID;
   unsigned MyKernelID;
   Module LLVMM;
-  TargetMachine *LLVMTargetMachine;
   GlobalVariable *LLVMGlobal;
 
   FunctionCallee GetThreadIdx = nullptr;
@@ -88,7 +87,7 @@ class LLVMLoop : public LoopOutlineProcessor {
                           ValueToValueMapTy &VMap) override final;
   void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                                DominatorTree &DT) override final;
-  void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet& LoopInputs) override final;
+  void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) override final;
 };
 } //namespace llvm
 
diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp
index 1d1fd763343f4..6db973aa5a9d6 100644
--- a/llvm/lib/Transforms/Tapir/GPUABI.cpp
+++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp
@@ -16,13 +16,17 @@
 #if 0
 
 #include "llvm/Transforms/Tapir/GPUABI.h"
+#include "llvm/Transforms/Tapir/TapirToTarget.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Tapir/Outline.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopPeel.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Vectorize.h"
@@ -105,15 +109,11 @@ LLVMLoop::LLVMLoop(Module &M)
 
   // Insert runtime-function declarations in LLVM host modules.
   Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext());
-  Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext());
   GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty);
-  Function* getid = LLVMM.getFunction("gtid");
 
   Type *VoidTy = Type::getVoidTy(M.getContext());
   Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
   Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo();
-  Type *Int8Ty = Type::getInt8Ty(M.getContext());
-  Type *Int32Ty = Type::getInt32Ty(M.getContext());
   Type *Int64Ty = Type::getInt64Ty(M.getContext());
   GPUInit = M.getOrInsertFunction("initRuntime", VoidTy);
   GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty);
@@ -187,19 +187,24 @@ unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args)
 
 void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
                                    ValueToValueMapTy &VMap) {
-  LLVMContext &Ctx = M.getContext();
-  Type *Int8Ty = Type::getInt8Ty(Ctx);
-  Type *Int32Ty = Type::getInt32Ty(Ctx);
-  //Type *Int64Ty = Type::getInt64Ty(Ctx);
-  //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
   Task *T = TL.getTask();
   Loop *L = TL.getLoop();
 
+  Function *Helper = Out.Outline;
+
+  DominatorTree DT;
+  DT.recalculate(*Helper); 
+  LoopInfo LI(DT); 
+
+  // The loop we care about is the outermost loop in the 
 
   BasicBlock *Entry = cast<BasicBlock>(VMap[L->getLoopPreheader()]);
   BasicBlock *Header = cast<BasicBlock>(VMap[L->getHeader()]);
   BasicBlock *Exit = cast<BasicBlock>(VMap[TL.getExitBlock()]);
   PHINode *PrimaryIV = cast<PHINode>(VMap[TL.getPrimaryInduction().first]);
+
+  Loop *NewLoop = LI.getLoopFor(Header); 
+
   InductionDescriptor ID = TL.getPrimaryInduction().second; 
   Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry);
   Instruction *ClonedSyncReg = cast<Instruction>(
@@ -217,8 +222,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
   Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false);
 
 
-  Function *Helper = Out.Outline;
-  Helper->setName("kitsune_kernel");
+  Helper->setName("kitsune_kernel"); 
   // Fix argument pointer types to global, nocapture
   // TODO: read/write attributes?
   LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n");
@@ -268,6 +272,49 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out,
   Value* NewStep = IncB.CreateMul(CStep, Grainsize); 
   BinOp->setOperand(1, NewStep); 
   ClonedCond->setPredicate(ICmpInst::Predicate::ICMP_UGE); 
+
+  // Make each thread reduce into its local memory
+  for(auto &BB : NewLoop->getBlocks()){
+    for(auto &I : *BB){
+      if(auto *CI = dyn_cast<CallInst>(&I)){
+        auto *f = CI->getCalledFunction(); 
+        if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+          IRBuilder<> PB(&I); 
+          auto ptr = CI->getOperand(0); 
+          auto ty = CI->getOperand(1)->getType();  
+          auto lptr = PB.CreateGEP(ty, ptr, ThreadIdx); 
+          CI->setOperand(0, lptr); 
+          f->removeFnAttr(Attribute::NoInline); 
+        }
+      }
+    }
+  }
+
+  TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
+  TargetLibraryInfo TLI(TLII); 
+  AssumptionCache AC(*Helper); 
+  ScalarEvolution SE(*Helper, TLI, AC, DT, LI); 
+  ValueToValueMapTy peelVMap; 
+  simplifyLoop(NewLoop, &DT, &LI, &SE, &AC, nullptr, false); 
+  peelLoop(NewLoop, 1, &LI, &SE, DT, &AC, false, peelVMap); 
+  SmallVector<Instruction*, 4> cis;
+  for(auto &BB : *Helper){
+    if(!NewLoop->contains(&BB)){ // better way?
+      for(auto &I : BB){
+        if(auto *CI = dyn_cast<CallInst>(&I)){
+          auto *f = CI->getCalledFunction(); 
+          if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+            IRBuilder<> pb(&I); 
+            pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); 
+            cis.push_back(&I); 
+          }
+        }
+      }
+    }
+  }
+  for(auto &I : cis){
+    I->eraseFromParent(); 
+  }
 }
 
 void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
@@ -425,7 +472,7 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M);
 }
 
-void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet &LoopInputs) {
+void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) {
   Loop *L = TL.getLoop();  
   BasicBlock *PH = L->getLoopPreheader(); 
   BasicBlock *Header = L->getHeader();
@@ -472,7 +519,6 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, V
     GS = RB.CreateCall(GPUGridSize); 
     Value *NBytes = RB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); 
     CallInst *Alloc = RB.CreateCall(GPUManagedMalloc, {NBytes}); 
-    LoopInputs.insert(Alloc); 
     // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing
     CI->setOperand(0, Alloc); 
 
@@ -511,7 +557,6 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, V
         BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1),
                           Idx->getName() + ".add");
     BasicBlock* Body = BodyTerm->getParent(); 
-    BasicBlock* LoopExit = ExitTerm->getParent(); 
     Idx->addIncoming(IdxAdd, Body); 
     ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); 
 
diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
index ff5321ca23699..0be661c1112eb 100644
--- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
@@ -1536,10 +1536,14 @@ Function *LoopSpawningImpl::createHelperForTapirLoop(
 /// Outline all recorded Tapir loops in the function.
 TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
 
+  std::map<Loop *, ValueToValueMapTy> LoopVMaps;   
+
   // Prepare Tapir loops for outlining.
   for (Task *T : post_order(TI.getRootTask())) {
-    if (TapirLoopInfo *TL = getTapirLoop(T)) {
+    // Run a pre-processing step before we create the helper function.
 
+    if (TapirLoopInfo *TL = getTapirLoop(T)) {
+      
       PredicatedScalarEvolution PSE(SE, *TL->getLoop());
 
       bool canOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME,
@@ -1556,6 +1560,8 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       OutlineProcessors[TL] =
         std::unique_ptr<LoopOutlineProcessor>(getOutlineProcessor(TL));
 
+      Loop *L = TL->getLoop(); 
+      OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L]);
     }
   }
 
@@ -1600,9 +1606,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
     Loop *L = TL->getLoop();
     LLVM_DEBUG(dbgs() << "Outlining Tapir " << *L << "\n");
 
-    ValueToValueMapTy VMap; 
-    // Run a pre-processing step before we create the helper function.
-    OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap, LoopInputSets[L]);
 
     // Convert the inputs of the Tapir loop to inputs to the helper.
     ValueSet TLInputsFixed;
@@ -1647,6 +1650,8 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       LoopInputs[L].push_back(V);
     LoopArgStarts[L] = ArgStart;
 
+    ValueToValueMapTy &VMap = LoopVMaps[L]; 
+    
     // Create the helper function.
     Function *Outline = createHelperForTapirLoop(
         TL, LoopArgs[L], OutlineProcessors[TL]->getIVArgIndex(F, LoopArgs[L]),

From c4812367f6acfe687ca30d8d11754e4345b4488c Mon Sep 17 00:00:00 2001
From: George Stelle <stelleg@lanl.gov>
Date: Tue, 14 Jan 2025 14:37:53 -0700
Subject: [PATCH 14/14] GPU reductions loop info fixes

---
 kitsune-tests/reductions/l2.c                 | 29 +++++----
 kitsune-tests/reductions/makefile             |  8 +--
 llvm/include/llvm/Transforms/Tapir/GPUABI.h   |  2 +-
 .../llvm/Transforms/Tapir/LoweringUtils.h     |  3 +-
 llvm/lib/Transforms/Tapir/GPUABI.cpp          | 18 ++----
 llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp  |  5 +-
 llvm/lib/Transforms/Tapir/LoopStripMine.cpp   | 63 +++++++++++--------
 7 files changed, 71 insertions(+), 57 deletions(-)

diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c
index 74fc51ec8337e..c1918baee371c 100644
--- a/kitsune-tests/reductions/l2.c
+++ b/kitsune-tests/reductions/l2.c
@@ -3,6 +3,7 @@
 #include<stdio.h>
 #include<stdlib.h>
 #include<kitsune.h>
+#include<omp.h>
 #include<gpu.h>
 
 reduction
@@ -14,11 +15,10 @@ __attribute__((noinline))
 double l2(uint64_t n, double* a){
   double red = 0; 
   forall(uint64_t i=0; i<n; i++){
-    sum(&red, a[i]); 
+    sum(&red, x); 
   }
 
-  return red; 
-  //return sqrt(red);
+  return sqrt(red); 
 }
 
 int main(int argc, char** argv){
@@ -28,23 +28,26 @@ int main(int argc, char** argv){
   double* arr = (double*)gpuManagedMalloc(sizeof(double) * n); 
 
   forall(uint64_t i=0; i<n; i++){
-    arr[i] = 1; 
+    arr[i] = (double)i; 
   }
 
   printf("result: %f \n", l2(n, arr));
 
-  /*
-  clock_t before = clock();
-  double par; 
+  double res[niter];
+  double before = omp_get_wtime(); 
   for(int i=0; i<niter; i++){
-    par = l2(n, arr);
+    double red = l2(n, arr); 
+    //printf("red: %f\n", red); 
+    res[i]=red; 
   }
-  clock_t after = clock(); 
-  double partime = (double)(after - before) / 1000000; 
+  double after = omp_get_wtime(); 
 
-  printf("par: %f in %f s \n" , par, partime);
+  double partime = (double)(after - before); 
   double bw = (double)((1ULL<<e) * niter * sizeof(double)) / (1000000000.0 * partime);  
-  printf("par bandwidth: %f GB/s \n" , bw);
-  */
+  printf("bandwidth: %f GB/s \n" , bw);
+
+  //double time = (double)(after - before) / 1000000; 
+  //double bw = (double)((1ULL<<e) * niter * sizeof(double)) / (1000000000.0 * time);  
+  //printf("bandwidth: %f GB/s \n" , bw);
 }
 
diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile
index 55e97f971e773..67eb8ddf1118d 100644
--- a/kitsune-tests/reductions/makefile
+++ b/kitsune-tests/reductions/makefile
@@ -1,4 +1,4 @@
-exes=l2_gpu l2 l2_serial l2_openmp
+exes=l2 l2_gpu l2_serial l2_openmp
 all:${exes}
 
 l2.ll : l2.c
@@ -11,16 +11,16 @@ l2_stripmined_opt.ll: l2_stripmined.ll
 	opt -S -O2 -o l2_stripmined_opt.ll l2_stripmined.ll
 
 l2: l2_stripmined_opt.ll
-	clang -ftapir=opencilk -O2 l2_stripmined_opt.ll -o l2 -lm  -lllvm-gpu -lpthread
+	clang -ftapir=opencilk -O1 l2_stripmined_opt.ll -o l2 -lm  -lllvm-gpu -lpthread -lomp
 
 l2_gpu.ll: l2.ll
 	clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@ -mllvm --debug-only=gpuabi
 
 l2_gpu: l2_gpu.ll
-	clang l2_gpu.ll -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu -lpthread
+	clang l2_gpu.ll -fPIC -O1 -o l2_gpu -ftapir=gpu -lm -lllvm-gpu -lpthread -lomp
 
 l2_serial: l2.ll
-	clang l2.ll -o l2_serial -ftapir=serial -lllvm-gpu -lm
+	clang l2.ll -o l2_serial -ftapir=serial -lllvm-gpu -lm -lomp
 
 l2_openmp: l2_openmp.c
 	clang l2_openmp.c -fopenmp -O2 -o l2_openmp -lm
diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h
index 112e10780a98e..3b45e2cc3754f 100644
--- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h
+++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h
@@ -87,7 +87,7 @@ class LLVMLoop : public LoopOutlineProcessor {
                           ValueToValueMapTy &VMap) override final;
   void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
                                DominatorTree &DT) override final;
-  void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) override final;
+  void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo& LI) override final;
 };
 } //namespace llvm
 
diff --git a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h
index 99a0a7029c32b..be0c8a5ed8367 100644
--- a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h
+++ b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Tapir/TapirTargetIDs.h"
@@ -449,7 +450,7 @@ class LoopOutlineProcessor {
   virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap)
   { /* no-op */ }
 
-  virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet &LoopInputs)
+  virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo &LI)
   { preProcessTapirLoop(TL, VMap); }
 
   /// Processes an outlined Function Helper for a Tapir loop, just after the
diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp
index 6db973aa5a9d6..66ec4c98d9d4e 100644
--- a/llvm/lib/Transforms/Tapir/GPUABI.cpp
+++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp
@@ -472,12 +472,13 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI,
   LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M);
 }
 
-void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) {
+void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo &LI) {
   Loop *L = TL.getLoop();  
   BasicBlock *PH = L->getLoopPreheader(); 
   BasicBlock *Header = L->getHeader();
   BasicBlock *Latch = L->getLoopLatch();
   BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+  BasicBlock &Entry = Header->getParent()->getEntryBlock(); 
   unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
   BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
   DetachInst *DI = cast<DetachInst>(Header->getTerminator());
@@ -512,13 +513,13 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) {
     //   gs = gridSize()
     //   reds = gpuManagedMalloc(gs)
     //   br header
-    IRBuilder<> RB(&PH->front()); 
+    IRBuilder<> EB(Entry.getFirstNonPHI()); 
     CallInst *CI = Red.first; 
     Value *Ptr = CI->getArgOperand(0); 
     Type *Ty = Red.second; 
-    GS = RB.CreateCall(GPUGridSize); 
-    Value *NBytes = RB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); 
-    CallInst *Alloc = RB.CreateCall(GPUManagedMalloc, {NBytes}); 
+    GS = EB.CreateCall(GPUGridSize); 
+    Value *NBytes = EB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); 
+    CallInst *Alloc = EB.CreateCall(GPUManagedMalloc, {NBytes}); 
     // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing
     CI->setOperand(0, Alloc); 
 
@@ -559,13 +560,6 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) {
     BasicBlock* Body = BodyTerm->getParent(); 
     Idx->addIncoming(IdxAdd, Body); 
     ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); 
-
-    // Update Loopinfo with reduction loop
-    //Loop* RL = LI->AllocateLoop(); 
-    //if(ParentLoop) ParentLoop->addChildLoop(RL); 
-    //else LI->addTopLevelLoop(RL); 
-    //RL->addBasicBlockToLoop(RedEpiHeader, *LI); 
-    //RL->addBasicBlockToLoop(body, *LI); 
   }
   
   LLVM_DEBUG(dbgs() << "Finished preProcessTapirLoop: " << *PH->getParent());
diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
index 0be661c1112eb..8d2320688143c 100644
--- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp
@@ -1561,10 +1561,13 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
         std::unique_ptr<LoopOutlineProcessor>(getOutlineProcessor(TL));
 
       Loop *L = TL->getLoop(); 
-      OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L]);
+      OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L], LI);
     }
   }
 
+  LI.releaseMemory(); 
+  LI.analyze(DT); 
+
   TaskOutlineMapTy TaskToOutline;
   DenseMap<Loop *, ValueSet> LoopInputSets;
   DenseMap<Loop *, SmallVector<Value *, 3>> LoopCtlArgs;
diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
index 0d12868b37c60..dfb9cfaeda4f7 100644
--- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
+++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp
@@ -1533,29 +1533,14 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
 
   // Record that the remainder loop was derived from a Tapir loop.
   (*RemainderLoop)->setDerivedFromTapirLoop();
-
-  // At this point, the code is well formed.  We now simplify the new loops,
-  // doing constant propagation and dead code elimination as we go.
-  simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC);
-  simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI,
-                             AC);
-  simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT,
-                             TTI, AC);
+  // Record that the old loop was derived from a Tapir loop.
+  L->setDerivedFromTapirLoop();
 
 #ifndef NDEBUG
   DT->verify();
   LI->verify(*DT);
 #endif
 
-  // Record that the old loop was derived from a Tapir loop.
-  L->setDerivedFromTapirLoop();
-
-  // Update TaskInfo manually using the updated DT.
-  if (TI)
-    // FIXME: Recalculating TaskInfo for the whole function is wasteful.
-    // Optimize this routine in the future.
-    TI->recalculate(*F, *DT);
-
   // accumulate reductions in main loop
   const std::vector<BasicBlock*>& blocks = L->getBlocks(); 
   std::set<std::pair<CallInst*, Type*>> reductions;
@@ -1680,15 +1665,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
 
     Instruction* term = LatchExit->getTerminator(); 
     BasicBlock *PostSync = term->getSuccessor(0);
-    BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); 
+    BasicBlock *RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT, LI, nullptr, "reductionEpilogue"); 
+    //BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); 
     RedEpiHeader->moveAfter(LatchExit); 
-    ReplaceInstWithInst(term, SyncInst::Create(RedEpiHeader, SyncReg)); 
-    BranchInst::Create(PostSync, RedEpiHeader);
+    ReplaceInstWithInst(PostSync->getTerminator(), SyncInst::Create(RedEpiHeader, SyncReg)); 
+    //BranchInst::Create(PostSync, RedEpiHeader);
     PHINode *Idx = PHINode::Create(outerIters->getType(), 2,
                                    "reductionepilogueidx",
                                    RedEpiHeader->getFirstNonPHI());
     IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); 
-    Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit);
+    Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), PostSync);
     Instruction *bodyTerm, *exitTerm;
     Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); 
     SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm);
@@ -1715,16 +1701,43 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
 
     // Update Loopinfo with reduction loop
     Loop* RL = LI->AllocateLoop(); 
-    if(ParentLoop) ParentLoop->addChildLoop(RL); 
-    else LI->addTopLevelLoop(RL); 
-    RL->addBasicBlockToLoop(RedEpiHeader, *LI); 
-    RL->addBasicBlockToLoop(body, *LI); 
+    if(!ParentLoop){
+      LI->addTopLevelLoop(RL); 
+      RL->addBasicBlockToLoop(RedEpiHeader, *LI); 
+      RL->addBasicBlockToLoop(body, *LI); 
+    }  
+    else {
+      ParentLoop->addChildLoop(RL);
+      LI->changeLoopFor(RedEpiHeader, RL);
+      RL->addBlockEntry(RedEpiHeader);
+      LI->changeLoopFor(body, RL); 
+      RL->addBlockEntry(body);
+    }
+    simplifyLoop(RL, DT, LI, SE, AC, nullptr, PreserveLCSSA); 
   }
 
   LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F);  
 
+  // At this point, the code is well formed.  We now simplify the new loops,
+  // doing constant propagation and dead code elimination as we go.
+  simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC);
+  simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI,
+                             AC);
+  simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT,
+                             TTI, AC);
+
+
   // TODO: fix DT updates
   DT->recalculate(*F); 
+  LI->releaseMemory(); 
+  LI->analyze(*DT);
+
+  // Update TaskInfo manually using the updated DT.
+  if (TI)
+    // FIXME: Recalculating TaskInfo for the whole function is wasteful.
+    // Optimize this routine in the future.
+    TI->recalculate(*F, *DT);
+
 
 #ifndef NDEBUG
   DT->verify();