From 0aa95c2eb0acd5a0971ef5d51384e21e99c14567 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Fri, 10 Dec 2021 09:37:44 -0700 Subject: [PATCH 01/14] Basic loop stripmine-based implicit parallel reduction (sum only) codegen --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 112 ++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 99bc2c9a962b7..3f328d9b4db2a 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1551,5 +1551,117 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. TI->recalculate(*F, *DT); + // iterate through the stores that should be treated as reductions + const std::vector& blocks = L->getBlocks(); + std::set reductions; + for (BasicBlock *BB : blocks){ + for (Instruction &I : *BB) { + if(auto si = dyn_cast(&I)){ + // TODO: better check if the store should be treated as a + // reduction. What we're doing is just checking if it's + // storing to a loop invariant pointer + Value* ptr = si->getPointerOperand(); + if(L->isLoopInvariant(ptr)) + reductions.insert(ptr); + } + } + } + + ValueToValueMap redMap; + // TODO: Modify the strip mining outer loop to be smaller: currently we are + // stack allocating n/2048 reduction values. + // TODO: Initialize local reductions with unit values + for(Value* ptr : reductions){ + IRBuilder<> B(F->getEntryBlock().getTerminator()); + auto ty = dyn_cast(ptr->getType())->getElementType(); + auto al = B.CreateAlloca(ty, TripCount, ptr->getName() + "_reduction"); + IRBuilder<> BH(L->getHeader()->getTerminator()); + auto lptr = BH.CreateBitCast( + BH.CreateGEP(al, NewIdx), + ptr->getType()); + redMap[ptr] = al; + // TODO: for now, just initializing with the initial sequential + // reduction value, which is often unit, but if it isn't this is + // wrong. + ptr->replaceUsesWithIf(lptr, [L](Use &u){ + if(auto I = dyn_cast(u.getUser())){ + I->dump(); + return L->contains(I->getParent()); + } else { + return false; + }; + }); + BH.CreateStore(BH.CreateLoad(ptr), lptr); + } + + // Epilog "join" of reduction values stored in local reduction value arrays. + // Should be able to use redMap to map original pointer (which is still used + // to reduce the remainder of the strimined loop, so you probably want to + // start the reduction with that value). + LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F); + + // We insert the reduction code at every sync corresponding to the strimined + // loop + // + // Sync + // RedEpiHeader + // RedEpiBody + // RedEpiExit + + if(!reductions.empty()){ + SmallVector syncs; + for(auto &bb : *F){ + if(auto *sync = dyn_cast(bb.getTerminator())){ + syncs.push_back(sync); + } + } + for(auto *sync : syncs){ + if(sync->getSyncRegion() == SyncReg){ + BasicBlock *PostSync = sync->getSuccessor(0); + BasicBlock* RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT); + PHINode *Idx = PHINode::Create(TripCount->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(TripCount, PostSync); + Instruction *bodyTerm, *exitTerm; + Value *cmp = BH.CreateIsNotNull(Idx); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. For now defaults to sums on primitive types. + // TODO: Add custom unital magmas and/or infer unital magma + for(auto& kv : redMap){ + auto al = kv.second; + Value* ptr = const_cast(kv.first); + auto lptr = BB.CreateBitCast( + BB.CreateGEP(al, Idx), + ptr->getType()); + auto acc = BB.CreateLoad(ptr); + auto x = BB.CreateLoad(lptr); + auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x); + BB.CreateStore(newacc, ptr); + } + Value *IdxSub = + BB.CreateSub(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".sub"); + Idx->addIncoming(IdxSub, bodyTerm->getParent()); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + } + } + } + + LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); + + // TODO: fix DT updates + DT->recalculate(*F); + /* +#ifndef NDEBUG + DT->verify(); + LI->verify(*DT); +#endif + */ + return NewLoop; } From b77e385bd96bd904aee6d82d6407afb03996fb34 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Fri, 10 Dec 2021 09:39:32 -0700 Subject: [PATCH 02/14] Explicit parallel reductions via unital magmas added --- clang/lib/Headers/magma.h | 43 ++++++++++++++++++++++++++ clang/lib/Headers/reductions.h | 56 ++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 clang/lib/Headers/magma.h create mode 100644 clang/lib/Headers/reductions.h diff --git a/clang/lib/Headers/magma.h b/clang/lib/Headers/magma.h new file mode 100644 index 0000000000000..43c4fb8f1075f --- /dev/null +++ b/clang/lib/Headers/magma.h @@ -0,0 +1,43 @@ +#include +#include + +template +struct Magma { + virtual a op(a x, a y) = 0; +}; + +template +struct UnitalMagma : public Magma { + virtual a id() = 0; +}; + +// Example unital magmas +template +struct Sum : UnitalMagma{ + a op(a x, a y){ return x + y; } + a id(){ return 0; } // look into this more +}; + +template +struct Product : UnitalMagma { + a op(a x, a y){ return x * y; } + a id(){ return 1; } +}; + +struct StringApp : UnitalMagma { + std::string op(std::string x, std::string y){ return x.append(y); } + std::string id() { return ""; } +}; + +template +struct Max : UnitalMagma { + a op(a x, a y){ return x > y ? x : y; } + a id() { return std::numeric_limits::min(); } +}; + +template +struct Min : UnitalMagma { + a op(a x, a y){ return x < y ? x : y; } + a id() { return std::numeric_limits::max(); } +}; + diff --git a/clang/lib/Headers/reductions.h b/clang/lib/Headers/reductions.h new file mode 100644 index 0000000000000..d9abf9cf276b5 --- /dev/null +++ b/clang/lib/Headers/reductions.h @@ -0,0 +1,56 @@ +#include"magma.h" +#include +#include +#include +//#include + +template +a reduce(um m, v& xs){ + auto acc = m.id(); + for(auto x : xs){ + acc = m.op(acc, x); + } + return acc; +} + +template +a parReduce(um m, v& xs, uint64_t nthreads){ + uint64_t linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + assert(linesize % sizeof(a) == 0); + uint64_t linenum = linesize / sizeof(a); + a* accs = new a[nthreads * linenum]; + uint64_t size = xs.end() - xs.begin(); + assert(size % nthreads == 0); + uint64_t grainsize = size / nthreads; + for(uint64_t i=0; i +a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs){ + if(end-start < gs){ + a acc = m.id(); + for(uint64_t i=start; i Date: Fri, 10 Dec 2021 09:45:22 -0700 Subject: [PATCH 03/14] Added reductions documentation --- kitsune/docs/using.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/kitsune/docs/using.md b/kitsune/docs/using.md index b07a2fa4ea435..e843f6f2f1e76 100644 --- a/kitsune/docs/using.md +++ b/kitsune/docs/using.md @@ -99,3 +99,32 @@ Each special mode and runtime transformation/ABI target has its own named config * ``opencl.cfg``: OpenCL runtime ABI target specific flags. These files can reduce complexity for end users by providing configuration- and build-specific flags. This can be important when version-specific bitcode files and other details are used. In addition, these files can provide developers additional flexibility for debugging, testing, and experimenting. Obviously, all these features can also be hardcoded onto the command line for a more traditional use case. In addition, to override any of the Kitsune or system configuration files you can place an empty config file within the user directory (no kitsune or system configuration files will be read in this case). + +## Reductions +We provide two approaches to reductions. The first (still very much a work in +progress and likely to break) is implicit reductions. This allows you to write +basic reductions in the way you would for sequential code, and have them be +optimized for parallelism, e.g. + +``` +forall(auto x : xs) { + acc += x; +} +``` + +should generate efficient parallel reduction code. + +Second, we provide a c++ interface for parallel reduction via user-defined +reduction operators. Formally, we require a unital magma, which is just a +reduction operator and a unit value, e.g. 0 for sums and 1 for products. + +This allows for the following style of reductions: + +``` +#include +... + double sum = reduce(Sum(), big); +``` + + + From 46a062919d7fba678ccbcc4e172cd5bea79e64bf Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 28 Jun 2022 10:34:21 -0600 Subject: [PATCH 04/14] Implicit sum reductions functional --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 28 ++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 3f328d9b4db2a..9f49e6cec8d6d 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1561,20 +1561,32 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // reduction. What we're doing is just checking if it's // storing to a loop invariant pointer Value* ptr = si->getPointerOperand(); - if(L->isLoopInvariant(ptr)) + if(L->isLoopInvariant(ptr)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << ptr->getName() << "\n"); reductions.insert(ptr); + } } } } + LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); ValueToValueMap redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. // TODO: Initialize local reductions with unit values + Instruction *bloc = nullptr; + if(Instruction* I = dyn_cast(TripCount)){ + bloc = I->getNextNode(); + } else { + bloc = F->getEntryBlock().getTerminator(); + } + IRBuilder<> RB(bloc); + Value *outerIters = RB.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); for(Value* ptr : reductions){ - IRBuilder<> B(F->getEntryBlock().getTerminator()); auto ty = dyn_cast(ptr->getType())->getElementType(); - auto al = B.CreateAlloca(ty, TripCount, ptr->getName() + "_reduction"); + auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); IRBuilder<> BH(L->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( BH.CreateGEP(al, NewIdx), @@ -1617,13 +1629,17 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, } for(auto *sync : syncs){ if(sync->getSyncRegion() == SyncReg){ + BasicBlock *Sync = sync->getParent(); BasicBlock *PostSync = sync->getSuccessor(0); - BasicBlock* RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT); - PHINode *Idx = PHINode::Create(TripCount->getType(), 2, + BasicBlock* RedEpiHeader = BasicBlock::Create(Sync->getContext(), "reductionEpilogue", Sync->getParent(), Sync); + RedEpiHeader->moveAfter(Sync); + sync->setSuccessor(0, RedEpiHeader); + BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, "reductionepilogueidx", RedEpiHeader->getFirstNonPHI()); IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); - Idx->addIncoming(TripCount, PostSync); + Idx->addIncoming(outerIters, sync->getParent()); Instruction *bodyTerm, *exitTerm; Value *cmp = BH.CreateIsNotNull(Idx); SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); From ce98bc63b260c80ce5039a2bd43298804e5ee1b5 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Wed, 1 Feb 2023 08:48:02 -0700 Subject: [PATCH 05/14] Outlined implementation approach for updated reduction implementation --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 31 ++++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 9f49e6cec8d6d..b83bcb7da6a49 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1589,12 +1589,28 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); IRBuilder<> BH(L->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( - BH.CreateGEP(al, NewIdx), + BH.CreateGEP(ty, al, NewIdx), ptr->getType()); redMap[ptr] = al; // TODO: for now, just initializing with the initial sequential // reduction value, which is often unit, but if it isn't this is - // wrong. + // wrong. What we need to do is assume there is more than one element, and + // use the first element for the first iteration of the loop. + // roughly: + // red = init; + // forall(i = ...){ + // red = reduce(red, body(i)); + // } + // red = init; + // localred[m]; + // forall(k = ...){ + // localred[i] = body(j_0); + // for(j = j_1 ...) + // localred = reduce(localred, body(j)); + // } + // for(k = ...) + // red = reduce(red, localred[k]); + // ptr->replaceUsesWithIf(lptr, [L](Use &u){ if(auto I = dyn_cast(u.getUser())){ I->dump(); @@ -1603,7 +1619,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, return false; }; }); - BH.CreateStore(BH.CreateLoad(ptr), lptr); + BH.CreateStore(BH.CreateLoad(ty, ptr), lptr); } // Epilog "join" of reduction values stored in local reduction value arrays. @@ -1651,11 +1667,12 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, for(auto& kv : redMap){ auto al = kv.second; Value* ptr = const_cast(kv.first); + auto ty = ptr->getType(); auto lptr = BB.CreateBitCast( - BB.CreateGEP(al, Idx), - ptr->getType()); - auto acc = BB.CreateLoad(ptr); - auto x = BB.CreateLoad(lptr); + BB.CreateGEP(ty, al, Idx), + ty); + auto acc = BB.CreateLoad(ty, ptr); + auto x = BB.CreateLoad(ty, lptr); auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x); BB.CreateStore(newacc, ptr); } From 0f86c20e99185ce469aad5decdf5ab8a26fdb0d5 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 21 Feb 2023 16:17:19 -0700 Subject: [PATCH 06/14] Working reductions, sequential semantics preserved for commutative associative magmas --- clang/include/clang/Basic/Attr.td | 8 ++ clang/lib/CodeGen/CGCall.cpp | 3 + clang/lib/Headers/kitsune.h.cmake | 145 ++++++++++++++++++++ clang/lib/Sema/SemaDeclAttr.cpp | 4 +- kitsune-tests/reductions/l2.c | 36 +++++ llvm/include/llvm/IR/Attributes.td | 10 ++ llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 140 ++++++++++--------- llvm/lib/Transforms/Tapir/OpenCilkABI.cpp | 18 ++- 8 files changed, 296 insertions(+), 68 deletions(-) create mode 100644 clang/lib/Headers/kitsune.h.cmake create mode 100644 kitsune-tests/reductions/l2.c diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 40e9848e513a8..05cdb8a377345 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4725,6 +4725,14 @@ def ClspvLibclcBuiltin: InheritableAttr { let SimpleHandler = 1; } +// +===== kitsune-/tapir-centric attributes + +def KitsuneReduction : InheritableAttr { + let Spellings = [Clang<"kitsune_reduction">]; + let Subjects = SubjectList<[FunctionLike]>; + let Documentation = [StrandMallocDocs]; +} + def TapirTarget : StmtAttr { let Spellings = [CXX11<"tapir","target">]; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 283fe88bf2f72..4be6d0b4920ee 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2408,6 +2408,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly()); FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); } + if (TargetDecl->hasAttr()) { + FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction); + } if (TargetDecl->hasAttr()) RetAttrs.addAttribute(llvm::Attribute::NoAlias); if (TargetDecl->hasAttr() && diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake new file mode 100644 index 0000000000000..d8e6cd4bf86ee --- /dev/null +++ b/clang/lib/Headers/kitsune.h.cmake @@ -0,0 +1,145 @@ + +/* + * Copyright (c) 2020 Triad National Security, LLC + * All rights reserved. + * + * This file is part of the kitsune/llvm project. It is released under + * the LLVM license. + */ +#ifndef __CLANG_KITSUNE_H__ +#define __CLANG_KITSUNE_H__ + +#include +#include + +#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET +#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET + +#include "kitsune_rt.h" + +#if defined(reduction) +#warning found reduction definition: try puttin kitsune lower in include order +#else +#define reduction __attribute__((noinline, kitsune_reduction)) +#endif + + +#if defined(KITSUNE_ENABLE_OPENCL_ABI_TARGET) +#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)a, n) +#ifdef __cplusplus +extern "C" { +#endif + void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n); +#ifdef __cplusplus +} +#endif +#endif + +#if defined(spawn) +#warning encountered multiple definitions of spawn! +#else +#define spawn _kitsune_spawn +#endif + +#if defined(sync) +#warning encountered multiple definitions of sync! +#else +#define sync _kitsune_sync +#endif + +#if defined(forall) +#warning encountered multiple definitions of forall! +#else +#define forall _kitsune_forall +#endif + + +#if defined(_tapir_cuda_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_cuMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_cuMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_cuMemFree(void*); + template + void dealloc(T* array) { + __kitrt_cuMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_cuMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_cuMemAllocManaged(total_bytes); + } + + void __kitrt_cuMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_cuMemFree(array); + } + #endif +#elif defined(_tapir_hip_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_hipMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_hipMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_hipMemFree(void*); + template + void dealloc(T* array) { + __kitrt_hipMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_hipMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_hipMemAllocManaged(total_bytes); + } + + void __kitrt_hipMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_hipMemFree(array); + } + #endif +#else + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_defaultMemAlloc(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_defaultMemAlloc(sizeof(T) * N); + } + + extern "C" void __kitrt_defaultMemFree(void*); + template + void dealloc(T* array) { + __kitrt_defaultMemFree(array); + } + #else + void* __attribute__((malloc)) __kitrt_defaultMemAlloc(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_defaultMemAlloc(total_bytes); + } + + void __kitrt_defaultMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void* array) { + __kitrt_defaultMemFree(array); + } + #endif // __cplusplus +#endif // cpu targets + +#endif diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 2f426bba3c74b..a159eb8e55287 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7120,10 +7120,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_TypeNullable: handleNullableTypeAttr(S, D, AL); break; - case ParsedAttr::AT_VTablePointerAuthentication: handleVTablePointerAuthentication(S, D, AL); break; + case ParsedAttr::AT_KitsuneReduction: + handleSimpleAttribute(S, D, AL); + break; } } diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c new file mode 100644 index 0000000000000..fc81e0057e8a5 --- /dev/null +++ b/kitsune-tests/reductions/l2.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +reduction +void sum(float *a, float b){ + *a += b; +} + +float l2(int n, float* a){ + float red = 3.14159; + forall(int i=0; i 1 ? atoi(argv[1]) : 4096 ; + float* arr = (float*)malloc(sizeof(float) * n); + for(int i=0 ; i; /// Function only reads from memory. def ReadOnly : EnumAttr<"readonly", [ParamAttr]>; +<<<<<<< HEAD +======= +/// Tapir reducer-related attributes. +def HyperView : EnumAttr<"hyper_view", [FnAttr]>; +def HyperToken : EnumAttr<"hyper_token", [FnAttr]>; +def ReducerRegister : EnumAttr<"reducer_register", [FnAttr]>; +def ReducerUnregister : EnumAttr<"reducer_unregister", [FnAttr]>; +def KitsuneReduction : EnumAttr<"kitsune_reduction", [FnAttr]>; + +>>>>>>> e2078e0739d8 (Working reductions, sequential semantics preserved for commutative associative magmas) /// Return value is always equal to this argument. def Returned : EnumAttr<"returned", [ParamAttr]>; diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index b83bcb7da6a49..fa1af8e829a31 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -37,6 +37,7 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; @@ -804,6 +805,10 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + + Function *F = Header->getParent(); + LLVM_DEBUG(dbgs() << "Function before strip mining\n" << *F); + // We will use the increment of the primary induction variable to derive // wrapping flags. Instruction *PrimaryInc = @@ -1551,26 +1556,25 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. TI->recalculate(*F, *DT); - // iterate through the stores that should be treated as reductions + // accumulate reductions const std::vector& blocks = L->getBlocks(); - std::set reductions; + std::set reductions; for (BasicBlock *BB : blocks){ for (Instruction &I : *BB) { - if(auto si = dyn_cast(&I)){ - // TODO: better check if the store should be treated as a - // reduction. What we're doing is just checking if it's - // storing to a loop invariant pointer - Value* ptr = si->getPointerOperand(); - if(L->isLoopInvariant(ptr)){ - LLVM_DEBUG(dbgs() << "Found reduction var: " << ptr->getName() << "\n"); - reductions.insert(ptr); + if(auto ci = dyn_cast(&I)){ + auto f = ci->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << + "with reduction function: " << f->getName() << "\n"); + reductions.insert(ci); + //TODO: check the type to confirm valid reduction } } } } LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); - ValueToValueMap redMap; + std::vector> redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. // TODO: Initialize local reductions with unit values @@ -1584,17 +1588,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Value *outerIters = RB.CreateUDiv(TripCount, ConstantInt::get(TripCount->getType(), Count), "stripiter"); - for(Value* ptr : reductions){ + for(CallInst* ci : reductions){ + auto ptr = ci->getArgOperand(0); auto ty = dyn_cast(ptr->getType())->getElementType(); auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); - IRBuilder<> BH(L->getHeader()->getTerminator()); + IRBuilder<> BH(NewLoop->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( BH.CreateGEP(ty, al, NewIdx), ptr->getType()); - redMap[ptr] = al; - // TODO: for now, just initializing with the initial sequential - // reduction value, which is often unit, but if it isn't this is - // wrong. What we need to do is assume there is more than one element, and + redMap.push_back(std::make_tuple(ci, ptr, al)); + // Assume there is more than one element, and // use the first element for the first iteration of the loop. // roughly: // red = init; @@ -1606,20 +1609,18 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // forall(k = ...){ // localred[i] = body(j_0); // for(j = j_1 ...) - // localred = reduce(localred, body(j)); + // reduce(localred+i, body(j)); // } // for(k = ...) - // red = reduce(red, localred[k]); + // reduce(&red, localred[k]); // ptr->replaceUsesWithIf(lptr, [L](Use &u){ if(auto I = dyn_cast(u.getUser())){ - I->dump(); return L->contains(I->getParent()); } else { return false; }; }); - BH.CreateStore(BH.CreateLoad(ty, ptr), lptr); } // Epilog "join" of reduction values stored in local reduction value arrays. @@ -1636,53 +1637,64 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // RedEpiBody // RedEpiExit + // Todo: re-order epilogue and reduction epilogue to preserve associativity if(!reductions.empty()){ - SmallVector syncs; - for(auto &bb : *F){ - if(auto *sync = dyn_cast(bb.getTerminator())){ - syncs.push_back(sync); - } - } - for(auto *sync : syncs){ - if(sync->getSyncRegion() == SyncReg){ - BasicBlock *Sync = sync->getParent(); - BasicBlock *PostSync = sync->getSuccessor(0); - BasicBlock* RedEpiHeader = BasicBlock::Create(Sync->getContext(), "reductionEpilogue", Sync->getParent(), Sync); - RedEpiHeader->moveAfter(Sync); - sync->setSuccessor(0, RedEpiHeader); - BranchInst::Create(PostSync, RedEpiHeader); - PHINode *Idx = PHINode::Create(outerIters->getType(), 2, - "reductionepilogueidx", - RedEpiHeader->getFirstNonPHI()); - IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); - Idx->addIncoming(outerIters, sync->getParent()); - Instruction *bodyTerm, *exitTerm; - Value *cmp = BH.CreateIsNotNull(Idx); - SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); - - IRBuilder<> BB(bodyTerm); - // For each reduction, get the allocated thread local reduced values and - // reduce them. For now defaults to sums on primitive types. - // TODO: Add custom unital magmas and/or infer unital magma - for(auto& kv : redMap){ - auto al = kv.second; - Value* ptr = const_cast(kv.first); - auto ty = ptr->getType(); - auto lptr = BB.CreateBitCast( - BB.CreateGEP(ty, al, Idx), - ty); - auto acc = BB.CreateLoad(ty, ptr); - auto x = BB.CreateLoad(ty, lptr); - auto newacc = acc->getType()->isFloatingPointTy() ? BB.CreateFAdd(acc,x) : BB.CreateAdd(acc,x); - BB.CreateStore(newacc, ptr); + // Peel the first iteration of the loop and replace the reduction calls in + // the peeled code with stores + peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA); + SmallVector cis; + for(auto &BB : NewLoop->blocks()){ + if(!L->contains(BB)){ // better way? + for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> pb(&I); + pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); + cis.push_back(&I); + f->removeFnAttr(Attribute::NoInline); + } + } } - Value *IdxSub = - BB.CreateSub(Idx, ConstantInt::get(Idx->getType(), 1), - Idx->getName() + ".sub"); - Idx->addIncoming(IdxSub, bodyTerm->getParent()); - ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); } } + + for(auto &I : cis){ + I->eraseFromParent(); + } + + Instruction* term = LatchExit->getTerminator(); + BasicBlock *PostSync = term->getSuccessor(0); + BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(term, SyncInst::Create(RedEpiHeader, SyncReg)); + BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit); + Instruction *bodyTerm, *exitTerm; + Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. + for(auto& kv : redMap){ + const auto [ ci, ptr, al ] = kv; + auto ty = dyn_cast(ptr->getType())->getElementType(); + auto lptr = BB.CreateBitCast( + BB.CreateGEP(ty, al, Idx), + ptr->getType()); + auto x = BB.CreateLoad(ty, lptr); + BB.CreateCall(ci->getCalledFunction(), { ptr, x }); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + Idx->addIncoming(IdxAdd, bodyTerm->getParent()); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); } LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); diff --git a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp index e048cdaf57c58..98339299f009d 100644 --- a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp +++ b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp @@ -151,8 +151,20 @@ void OpenCilkABI::prepareModule() { if ("" != ClOpenCilkRuntimeBCPath) RuntimeBCPath = ClOpenCilkRuntimeBCPath; - if ("" == RuntimeBCPath) - C.emitError("OpenCilkABI: No OpenCilk bitcode ABI file given."); + std::optional path; + if("" == RuntimeBCPath){ + path = sys::Process::FindInEnvPath("LD_LIBRARY_PATH", "libopencilk-abi.bc"); + if (! path) + // TODO: This is an in-tree build solution for now... + #if defined(OPENCILK_BC_PATH) + path = OPENCILK_BC_PATH; + #else + report_fatal_error("Could not find OpenCilk runtime bitcode file " + "(libopencilk-abi.bc) in LD_LIBRARY_PATH."); + #endif + } else { + path = ClOpenCilkRuntimeBCPath.getValue(); + } LLVM_DEBUG(dbgs() << "Using external bitcode file for OpenCilk ABI: " << RuntimeBCPath << "\n"); @@ -161,7 +173,7 @@ void OpenCilkABI::prepareModule() { // Parse the bitcode file. This call imports structure definitions, but not // function definitions. if (std::unique_ptr ExternalModule = - parseIRFile(RuntimeBCPath, SMD, C)) { + parseIRFile(path.getValue(), SMD, C)) { // Get the original DiagnosticHandler for this context. std::unique_ptr OrigDiagHandler = C.getDiagnosticHandler(); From 983ddbac55f6cb749bec793f36d792224c6968dc Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 13 Jun 2023 11:32:26 -0600 Subject: [PATCH 07/14] updated reduction example with timing --- kitsune-tests/reductions/l2.c | 40 +++++++++++++------ llvm/lib/Transforms/Tapir/GPUABI.cpp | 44 +++++++-------------- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 39 +++++++++++++----- 3 files changed, 73 insertions(+), 50 deletions(-) diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index fc81e0057e8a5..e39b8765da54d 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -1,36 +1,54 @@ +#include #include #include #include #include +#include reduction -void sum(float *a, float b){ +void sum(double *a, double b){ *a += b; } -float l2(int n, float* a){ - float red = 3.14159; - forall(int i=0; i 1 ? atoi(argv[1]) : 4096 ; - float* arr = (float*)malloc(sizeof(float) * n); - for(int i=0 ; i 1 ? atoi(argv[1]) : 2ULL<<28 ; + double* arr = (double*)gpuManagedMalloc(sizeof(double) * n); + + forall(uint64_t i=0; i + JIT("jit-callsite", cl::init(false), cl::NotHidden, + cl::desc("Wait until parallel loop is called to jit kernel. " + "(default=false)")); + Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); @@ -231,11 +238,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Grainsize argument is the third LC arg. Grainsize = &*++(++OutlineArgsIter); } - ThreadID = B.CreateMul(ThreadID, Grainsize); - Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); - Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); - Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); - Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); + Value *Cond = B.CreateICmpUGE(ThreadID, End); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); @@ -245,10 +248,10 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Update cloned loop condition to use the thread-end value. unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast(VMap[TL.getCondition()]); - if (ClonedCond->getOperand(0) != ThreadEnd) + if (ClonedCond->getOperand(0) != End) ++TripCountIdx; - ClonedCond->setOperand(TripCountIdx, ThreadEnd); - assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && + ClonedCond->setOperand(TripCountIdx, End); + assert(ClonedCond->getOperand(TripCountIdx) == End && "End argument not used in condition"); } @@ -261,8 +264,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Type *Int64Ty = Type::getInt64Ty(Ctx); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); - //Task *T = TL.getTask(); - //Instruction *ReplCall = cast(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; @@ -388,12 +389,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - //Value* TripCount = isSRetInput(TOI.InputSet[0]) ? TOI.InputSet[1] : TOI.InputSet[0]; - //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, - // TOI.InputSet)); - //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, - // TOI.InputSet)); - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); @@ -417,20 +412,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, B.CreateStore(VoidVPtr, argPtr); } - Value *Grainsize = TL.getGrainsize() ? - ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : - OrderedInputs[2]; - - //Type *Int64Ty = Type::getInt64Ty(LLVMM.getContext()); - Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); - Value *RunRem = B.CreateURem(TripCount, Grainsize); - Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); - Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); - Value *RunSize = B.CreateZExt(B.CreateAdd(RunSizeQ, IsRemAdd), Int64Ty); - - Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); - Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); - Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, RunSize }); + Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); + Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); + Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index fa1af8e829a31..422843ccf8e8e 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1556,7 +1556,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. TI->recalculate(*F, *DT); - // accumulate reductions + // accumulate reductions in main loop const std::vector& blocks = L->getBlocks(); std::set reductions; for (BasicBlock *BB : blocks){ @@ -1572,6 +1572,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, } } } + + // accumulate reductions in epilog loop LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); std::vector> redMap; @@ -1588,10 +1590,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Value *outerIters = RB.CreateUDiv(TripCount, ConstantInt::get(TripCount->getType(), Count), "stripiter"); + auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); for(CallInst* ci : reductions){ + // TODO: generic allocation/free calls auto ptr = ci->getArgOperand(0); auto ty = dyn_cast(ptr->getType())->getElementType(); - auto al = RB.CreateAlloca(ty, outerIters, ptr->getName() + "_reduction"); + auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); + auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); + auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); + //auto al = RB.CreateBitCast(rm, ty); + //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction"); IRBuilder<> BH(NewLoop->getHeader()->getTerminator()); auto lptr = BH.CreateBitCast( BH.CreateGEP(ty, al, NewIdx), @@ -1605,13 +1613,17 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // red = reduce(red, body(i)); // } // red = init; - // localred[m]; - // forall(k = ...){ + // localred[m+1]; + // + // forall(k ∈ 0..m-1){ // localred[i] = body(j_0); - // for(j = j_1 ...) + // for(j ∈ j_k_1..j_k_l-1) // reduce(localred+i, body(j)); // } - // for(k = ...) + // for( j ∈ j_k_m .. n ) + // reduce(localred+m, body(j)); + // } + // for(k ∈ 0..m) // reduce(&red, localred[k]); // ptr->replaceUsesWithIf(lptr, [L](Use &u){ @@ -1688,25 +1700,34 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, BB.CreateGEP(ty, al, Idx), ptr->getType()); auto x = BB.CreateLoad(ty, lptr); + BB.SetCurrentDebugLocation(ci->getDebugLoc()); BB.CreateCall(ci->getCalledFunction(), { ptr, x }); } Value *IdxAdd = BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), Idx->getName() + ".add"); - Idx->addIncoming(IdxAdd, bodyTerm->getParent()); + BasicBlock* body = bodyTerm->getParent(); + BasicBlock* loopExit = exitTerm->getParent(); + Idx->addIncoming(IdxAdd, body); ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + + // Update Loopinfo with reduction loop + Loop* RL = LI->AllocateLoop(); + if(ParentLoop) ParentLoop->addChildLoop(RL); + else LI->addTopLevelLoop(RL); + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); } LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); // TODO: fix DT updates DT->recalculate(*F); - /* + #ifndef NDEBUG DT->verify(); LI->verify(*DT); #endif - */ return NewLoop; } From 6a40583b15349ed7afe0ec94564061f6129f4407 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 21 Sep 2023 09:22:56 -0600 Subject: [PATCH 08/14] Fixed reductions use of pointer types --- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 422843ccf8e8e..945dc16f89000 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1594,7 +1594,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, for(CallInst* ci : reductions){ // TODO: generic allocation/free calls auto ptr = ci->getArgOperand(0); - auto ty = dyn_cast(ptr->getType())->getElementType(); + auto ty = dyn_cast(ptr->getType())->getArrayElementType(); auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); @@ -1653,7 +1653,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if(!reductions.empty()){ // Peel the first iteration of the loop and replace the reduction calls in // the peeled code with stores - peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA); + ValueToValueMapTy VMap; + peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); SmallVector cis; for(auto &BB : NewLoop->blocks()){ if(!L->contains(BB)){ // better way? @@ -1695,7 +1696,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // reduce them. for(auto& kv : redMap){ const auto [ ci, ptr, al ] = kv; - auto ty = dyn_cast(ptr->getType())->getElementType(); + auto ty = dyn_cast(ptr->getType())->getArrayElementType(); auto lptr = BB.CreateBitCast( BB.CreateGEP(ty, al, Idx), ptr->getType()); From f5330b43ff1d8f987e6d170b2bb867243f9e386d Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 12 Sep 2024 14:35:33 -0600 Subject: [PATCH 09/14] working reductions on 16.x --- llvm/include/llvm/Bitcode/LLVMBitCodes.h | 2 ++ llvm/include/llvm/IR/GlobalValue.h | 1 - llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 ++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 ++ llvm/lib/Transforms/Tapir/GPUABI.cpp | 3 ++- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 19 ++++++++++--------- llvm/lib/Transforms/Tapir/OpenCilkABI.cpp | 2 +- 7 files changed, 19 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 40ae85cf16919..fa5bdd2c5cc4b 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -764,6 +764,8 @@ enum AttributeKindCodes { ATTR_KIND_HYBRID_PATCHABLE = 95, ATTR_KIND_SANITIZE_CILK = 96, ATTR_KIND_STEALABLE = 97, + ATTR_KIND_SANITIZE_CILK = 98, + ATTR_KIND_KITSUNE_REDUCTION = 99, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h index 59ba8919d8605..d98c0c5f4dfd0 100644 --- a/llvm/include/llvm/IR/GlobalValue.h +++ b/llvm/include/llvm/IR/GlobalValue.h @@ -571,7 +571,6 @@ class GlobalValue : public Constant { // FIXME: need a better solution but also too difficult to re-outline // everything w/ tapir for GPU targets (AMD in particular)... - //#warning "FIXME: need better solution than mutateType()." void mutateValueType(Type *Ty) { ValueType = Ty; } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 92411913c43ee..09306d88ae805 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2189,6 +2189,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Range; case bitc::ATTR_KIND_INITIALIZES: return Attribute::Initializes; + case bitc::ATTR_KIND_KITSUNE_REDUCTION: + return Attribute::KitsuneReduction; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 85f9d1556c26c..5251549c36f2d 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -889,6 +889,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_RANGE; case Attribute::Initializes: return bitc::ATTR_KIND_INITIALIZES; + case Attribute::KitsuneReduction: + return bitc::ATTR_KIND_KITSUNE_REDUCTION; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index 12922e3e65aa8..03b7309023594 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -305,7 +305,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, SmallVector Returns; CloneFunctionInto(deviceF, f, VMap, CloneFunctionChangeType::DifferentModule, Returns); // GPU calls are slow as balls, try to force inlining - deviceF->addFnAttr(Attribute::AlwaysInline); + if(!deviceF->hasFnAttribute(Attribute::NoInline)) + deviceF->addFnAttr(Attribute::AlwaysInline); todo.push_back(deviceF); } } diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 945dc16f89000..0d12868b37c60 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1558,7 +1558,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // accumulate reductions in main loop const std::vector& blocks = L->getBlocks(); - std::set reductions; + std::set> reductions; for (BasicBlock *BB : blocks){ for (Instruction &I : *BB) { if(auto ci = dyn_cast(&I)){ @@ -1566,7 +1566,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << "with reduction function: " << f->getName() << "\n"); - reductions.insert(ci); + auto ty = ci->getArgOperand(1)->getType(); + reductions.insert(std::make_pair(ci, ty)); //TODO: check the type to confirm valid reduction } } @@ -1576,7 +1577,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // accumulate reductions in epilog loop LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); - std::vector> redMap; + std::vector> redMap; // TODO: Modify the strip mining outer loop to be smaller: currently we are // stack allocating n/2048 reduction values. // TODO: Initialize local reductions with unit values @@ -1591,12 +1592,13 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, ConstantInt::get(TripCount->getType(), Count), "stripiter"); auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); - for(CallInst* ci : reductions){ + for(auto &pair : reductions){ // TODO: generic allocation/free calls + auto ci = pair.first; auto ptr = ci->getArgOperand(0); - auto ty = dyn_cast(ptr->getType())->getArrayElementType(); + auto ty = pair.second; auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); - auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(nred->getType()))); + auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty))); auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); //auto al = RB.CreateBitCast(rm, ty); //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction"); @@ -1604,7 +1606,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, auto lptr = BH.CreateBitCast( BH.CreateGEP(ty, al, NewIdx), ptr->getType()); - redMap.push_back(std::make_tuple(ci, ptr, al)); + redMap.push_back(std::make_tuple(ci, ptr, al, ty)); // Assume there is more than one element, and // use the first element for the first iteration of the loop. // roughly: @@ -1695,8 +1697,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // For each reduction, get the allocated thread local reduced values and // reduce them. for(auto& kv : redMap){ - const auto [ ci, ptr, al ] = kv; - auto ty = dyn_cast(ptr->getType())->getArrayElementType(); + const auto [ ci, ptr, al, ty ] = kv; auto lptr = BB.CreateBitCast( BB.CreateGEP(ty, al, Idx), ptr->getType()); diff --git a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp index 98339299f009d..be51e49d0d5d7 100644 --- a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp +++ b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp @@ -173,7 +173,7 @@ void OpenCilkABI::prepareModule() { // Parse the bitcode file. This call imports structure definitions, but not // function definitions. if (std::unique_ptr ExternalModule = - parseIRFile(path.getValue(), SMD, C)) { + parseIRFile(RuntimeBCPath, SMD, C)) { // Get the original DiagnosticHandler for this context. std::unique_ptr OrigDiagHandler = C.getDiagnosticHandler(); From 0c73a4158b07add4531ff38a3bd74bff25e9c4d5 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Mon, 16 Sep 2024 12:42:34 -0600 Subject: [PATCH 10/14] added reductions example makefile --- kitsune-tests/reductions/makefile | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 kitsune-tests/reductions/makefile diff --git a/kitsune-tests/reductions/makefile b/kitsune-tests/reductions/makefile new file mode 100644 index 0000000000000..78e4a2a8e81cb --- /dev/null +++ b/kitsune-tests/reductions/makefile @@ -0,0 +1,25 @@ +all: l2 l2_gpu l2_serial + +l2.ll : l2.c + clang -S -emit-llvm l2.c -O1 -ftapir=none + +l2_stripmined.ll: l2.ll + opt -S -passes="loop-stripmine" -o l2_stripmined.ll l2.ll + +l2_stripmined_opt.ll: l2_stripmined.ll + opt -S -O2 -o l2_stripmined_opt.ll l2_stripmined.ll + +l2: l2_stripmined_opt.ll + clang -ftapir=opencilk -O2 l2_stripmined_opt.ll -o l2 -lm -lllvm-gpu + +l2_gpu.ll: l2_stripmined_opt.ll + clang -ftapir=gpu -O1 -S -emit-llvm $< -o $@ + +l2_gpu: l2_gpu.ll + clang l2_gpu.ll -fPIC -o l2_gpu -ftapir=gpu -lm -lllvm-gpu + +l2_serial: l2.ll + clang l2.ll -o l2_serial -ftapir=serial -lllvm-gpu -lm + +clean: + rm -f l2_stripmined.ll l2 l2.ll From 39204642fc55eb2dc50cfd553960f157a1069b94 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 31 Oct 2024 09:23:38 -0600 Subject: [PATCH 11/14] Added openmp reduce for comparison --- kitsune-tests/reductions/l2.c | 13 +++++-- kitsune-tests/reductions/l2_openmp.c | 55 ++++++++++++++++++++++++++++ kitsune-tests/reductions/makefile | 5 ++- 3 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 kitsune-tests/reductions/l2_openmp.c diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index e39b8765da54d..c8e936fb98b48 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -28,17 +28,22 @@ double l2_seq(uint64_t n, double* a){ } int main(int argc, char** argv){ - uint64_t n = argc > 1 ? atoi(argv[1]) : 2ULL<<28 ; + int e = argc > 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include + +double l2(uint64_t n, double* a){ + double red = 0; + #pragma omp parallel for reduction(+:red) + for(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< Date: Wed, 6 Nov 2024 16:32:16 -0700 Subject: [PATCH 12/14] mostly working grid stride gpu reductions --- kitsune-tests/reductions/l2.c | 20 +-- kitsune-tests/reductions/l2_openmp.c | 43 ++++-- kitsune-tests/reductions/makefile | 6 +- llvm/include/llvm/Transforms/Tapir/GPUABI.h | 5 +- .../llvm/Transforms/Tapir/LoweringUtils.h | 3 + llvm/lib/Transforms/Tapir/GPUABI.cpp | 141 +++++++++++++++--- llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp | 18 ++- 7 files changed, 176 insertions(+), 60 deletions(-) diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index c8e936fb98b48..1da6e4b2968f3 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -10,6 +10,7 @@ void sum(double *a, double b){ *a += b; } +__attribute__((noinline)) double l2(uint64_t n, double* a){ double red = 0; forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; int niter = argc > 2 ? atoi(argv[2]) : 100; @@ -37,8 +30,9 @@ int main(int argc, char** argv){ arr[i] = i; } - l2(n, arr); + printf("par:%f \n", l2(n, arr)); + /* clock_t before = clock(); double par; for(int i=0; i #include #include +#include double l2(uint64_t n, double* a){ double red = 0; @@ -14,12 +15,19 @@ double l2(uint64_t n, double* a){ return sqrt(red); } -double l2_seq(uint64_t n, double* a){ - double red = 0; - for(uint64_t i=0; i < n; i++){ - red += a[i]*a[i]; +double l2_grid(uint64_t n, double* a){ + double red[12]; + #pragma omp parallel for + for(uint64_t i=0; i<12; i++){ + red[i] = 0; + for(uint64_t j=i; j OrderedInputs; public: @@ -86,8 +88,9 @@ class LLVMLoop : public LoopOutlineProcessor { ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; + void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet& LoopInputs) override final; }; -} +} //namespace llvm #endif /* diff --git a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h index e2e65b4000fed..99a0a7029c32b 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h +++ b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h @@ -449,6 +449,9 @@ class LoopOutlineProcessor { virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { /* no-op */ } + virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet &LoopInputs) + { preProcessTapirLoop(TL, VMap); } + /// Processes an outlined Function Helper for a Tapir loop, just after the /// function has been outlined. virtual void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index 03b7309023594..1d1fd763343f4 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -45,8 +45,10 @@ static cl::opt "(default=false)")); Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { - Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); - + //Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); + Module *M = GrainsizeCall->getModule(); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Value* Grainsize = CallInst::Create(M->getOrInsertFunction("gpuGridSize", Int64Ty), {}, GrainsizeCall); // Replace uses of grainsize intrinsic call with this grainsize value. GrainsizeCall->replaceAllUsesWith(Grainsize); return Grainsize; @@ -116,6 +118,8 @@ LLVMLoop::LLVMLoop(Module &M) GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty); GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy, VoidPtrTy); + GPUManagedMalloc = M.getOrInsertFunction("gpuManagedMalloc", VoidPtrTy, Int64Ty); + GPUGridSize = M.getOrInsertFunction("gpuGridSize", Int64Ty); } void LLVMLoop::setupLoopOutlineArgs( @@ -196,6 +200,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, BasicBlock *Header = cast(VMap[L->getHeader()]); BasicBlock *Exit = cast(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); + InductionDescriptor ID = TL.getPrimaryInduction().second; Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); Instruction *ClonedSyncReg = cast( VMap[T->getDetach()->getSyncRegion()]); @@ -253,14 +258,22 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ClonedCond->setOperand(TripCountIdx, End); assert(ClonedCond->getOperand(TripCountIdx) == End && "End argument not used in condition"); - + // we replace the step + assert(TL.getPrimaryInduction().second.getInductionBinOp()->getOpcode() == Instruction::BinaryOps::Add && + "Only support gpu kernels with addition for induction step"); + ConstantInt *CStep = cast(VMap[ID.getConstIntStepValue()]); + BinaryOperator *BinOp = cast(VMap[ID.getInductionBinOp()]); + assert(CStep && BinOp && "Couldn't infer step or operator from primary induction gpu variable"); + IRBuilder<> IncB(&BinOp->getParent()->front()); + Value* NewStep = IncB.CreateMul(CStep, Grainsize); + BinOp->setOperand(1, NewStep); + ClonedCond->setPredicate(ICmpInst::Predicate::ICMP_UGE); } void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); @@ -273,12 +286,7 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, IRBuilder<> B(&NBB->front()); - // Compile the kernel - //LLVMM.getFunctionList().remove(TOI.Outline); - //TOI.Outline->eraseFromParent(); - LLVMContext &LLVMCtx = LLVMM.getContext(); - - ValueToValueMapTy VMap; + ValueToValueMapTy VMap; // We recursively add definitions and declarations to the device module SmallVector todo; todo.push_back(LLVMM.getFunction("kitsune_kernel")); @@ -390,14 +398,10 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); - Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); - Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - - Constant *kernelSize = ConstantInt::get(Int64Ty, - LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); - BasicBlock &EBB = Parent->getEntryBlock(); - IRBuilder<> EB(&EBB.front()); + Constant *kernelSize = ConstantInt::get(Int64Ty, + LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); + BasicBlock &EBB = Parent->getEntryBlock(); + IRBuilder<> EB(&EBB.front()); EB.CreateCall(GPUInit, {}); ArrayType* arrayType = ArrayType::get(VoidPtrTy, OrderedInputs.size()); @@ -421,4 +425,103 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } -#endif +void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet &LoopInputs) { + Loop *L = TL.getLoop(); + BasicBlock *PH = L->getLoopPreheader(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + BranchInst *LatchBR = cast(Latch->getTerminator()); + unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; + BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + DetachInst *DI = cast(Header->getTerminator()); + Value *SyncReg = DI->getSyncRegion(); + Value *GS; + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // accumulate Reductions in main loop + const std::vector& Blocks = L->getBlocks(); + std::set> Reductions; + for (BasicBlock *BB : Blocks){ + for (Instruction &I : *BB) { + if(auto *CI = dyn_cast(&I)){ + auto *F = CI->getCalledFunction(); + if(F->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << CI->getArgOperand(0)->getName() << + "with reduction function: " << F->getName() << "\n"); + auto *Ty = CI->getArgOperand(1)->getType(); + Reductions.insert(std::make_pair(CI, Ty)); + //TODO: check the type to confirm valid reduction + } + } + } + } + + std::vector> RedMap; + + for(auto &Red : Reductions){ + // We assume there's a preheader, and use it to insert allocations for reductions: + // preheader: + // gs = gridSize() + // reds = gpuManagedMalloc(gs) + // br header + IRBuilder<> RB(&PH->front()); + CallInst *CI = Red.first; + Value *Ptr = CI->getArgOperand(0); + Type *Ty = Red.second; + GS = RB.CreateCall(GPUGridSize); + Value *NBytes = RB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); + CallInst *Alloc = RB.CreateCall(GPUManagedMalloc, {NBytes}); + LoopInputs.insert(Alloc); + // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing + CI->setOperand(0, Alloc); + + RedMap.push_back(std::make_tuple(CI, Ptr, Alloc, Ty)); + } + + if(!Reductions.empty()){ + Instruction* Term = LatchExit->getTerminator(); + BasicBlock *PostSync = Term->getSuccessor(0); + BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(Term, SyncInst::Create(RedEpiHeader, SyncReg)); + BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(GS->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(ConstantInt::get(GS->getType(), 0), LatchExit); + Instruction *BodyTerm, *ExitTerm; + Value *Cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, GS); + SplitBlockAndInsertIfThenElse(Cmp, RedEpiHeader->getTerminator(), &BodyTerm, &ExitTerm); + + IRBuilder<> BB(BodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. + for(auto& KV : RedMap){ + const auto [ CI, Ptr, Al, Ty ] = KV; + Value* Lptr = BB.CreateBitCast( + BB.CreateGEP(Ty, Al, Idx), + Ptr->getType()); + Value *LR = BB.CreateLoad(Ty, Lptr); + BB.SetCurrentDebugLocation(CI->getDebugLoc()); + BB.CreateCall(CI->getCalledFunction(), { Ptr, LR }); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + BasicBlock* Body = BodyTerm->getParent(); + BasicBlock* LoopExit = ExitTerm->getParent(); + Idx->addIncoming(IdxAdd, Body); + ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); + + // Update Loopinfo with reduction loop + //Loop* RL = LI->AllocateLoop(); + //if(ParentLoop) ParentLoop->addChildLoop(RL); + //else LI->addTopLevelLoop(RL); + //RL->addBasicBlockToLoop(RedEpiHeader, *LI); + //RL->addBasicBlockToLoop(body, *LI); + } + + LLVM_DEBUG(dbgs() << "Finished preProcessTapirLoop: " << *PH->getParent()); +} diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp index d98fdad17043a..ff5321ca23699 100644 --- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp +++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp @@ -1535,11 +1535,14 @@ Function *LoopSpawningImpl::createHelperForTapirLoop( /// Outline all recorded Tapir loops in the function. TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { + // Prepare Tapir loops for outlining. for (Task *T : post_order(TI.getRootTask())) { if (TapirLoopInfo *TL = getTapirLoop(T)) { + PredicatedScalarEvolution PSE(SE, *TL->getLoop()); - bool CanOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME, + + bool canOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME, ORE, TTI); if (!CanOutline) { const Loop *L = TL->getLoop(); @@ -1552,6 +1555,7 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { // Get an outline processor for each Tapir loop. OutlineProcessors[TL] = std::unique_ptr(getOutlineProcessor(TL)); + } } @@ -1589,12 +1593,17 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { } // end timed region TapirLoopInfo *TL = getTapirLoop(T); + if (!TL) continue; Loop *L = TL->getLoop(); LLVM_DEBUG(dbgs() << "Outlining Tapir " << *L << "\n"); + ValueToValueMapTy VMap; + // Run a pre-processing step before we create the helper function. + OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap, LoopInputSets[L]); + // Convert the inputs of the Tapir loop to inputs to the helper. ValueSet TLInputsFixed; ValueToValueMapTy InputMap; @@ -1611,7 +1620,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { OutlineProcessors[TL]->getArgStructMode(), InputMap, L); } // end timed region - ValueSet HelperArgs; SmallVector HelperInputs; { @@ -1619,6 +1627,7 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { "Setup inputs to Tapir-loop helper function", TimerGroupName, TimerGroupDescription, TimePassesIsEnabled); + OutlineProcessors[TL]->setupLoopOutlineArgs( F, HelperArgs, HelperInputs, LoopInputSets[L], LoopCtlArgs[L], LoopCtlInputs[L], TLInputsFixed); @@ -1638,11 +1647,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { LoopInputs[L].push_back(V); LoopArgStarts[L] = ArgStart; - ValueToValueMapTy VMap; - - // Run a pre-processing step before we create the helper function. - OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap); - // Create the helper function. Function *Outline = createHelperForTapirLoop( TL, LoopArgs[L], OutlineProcessors[TL]->getIVArgIndex(F, LoopArgs[L]), From c85f06d8a66d62d60535c4f63d08abe54a300122 Mon Sep 17 00:00:00 2001 From: George Stelle Date: Thu, 7 Nov 2024 21:34:21 -0700 Subject: [PATCH 13/14] working high performance gpu reductions --- kitsune-tests/reductions/l2.c | 9 +-- kitsune-tests/reductions/makefile | 11 +-- llvm/include/llvm/Transforms/Tapir/GPUABI.h | 3 +- llvm/lib/Transforms/Tapir/GPUABI.cpp | 73 ++++++++++++++++---- llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp | 13 ++-- 5 files changed, 80 insertions(+), 29 deletions(-) diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index 1da6e4b2968f3..74fc51ec8337e 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -14,10 +14,11 @@ __attribute__((noinline)) double l2(uint64_t n, double* a){ double red = 0; forall(uint64_t i=0; igetPointerTo(); - Type *Int8Ty = Type::getInt8Ty(M.getContext()); - Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty); @@ -187,19 +187,24 @@ unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { - LLVMContext &Ctx = M.getContext(); - Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); - //Type *Int64Ty = Type::getInt64Ty(Ctx); - //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); + Function *Helper = Out.Outline; + + DominatorTree DT; + DT.recalculate(*Helper); + LoopInfo LI(DT); + + // The loop we care about is the outermost loop in the BasicBlock *Entry = cast(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast(VMap[L->getHeader()]); BasicBlock *Exit = cast(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); + + Loop *NewLoop = LI.getLoopFor(Header); + InductionDescriptor ID = TL.getPrimaryInduction().second; Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); Instruction *ClonedSyncReg = cast( @@ -217,8 +222,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); - Function *Helper = Out.Outline; - Helper->setName("kitsune_kernel"); + Helper->setName("kitsune_kernel"); // Fix argument pointer types to global, nocapture // TODO: read/write attributes? LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n"); @@ -268,6 +272,49 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, Value* NewStep = IncB.CreateMul(CStep, Grainsize); BinOp->setOperand(1, NewStep); ClonedCond->setPredicate(ICmpInst::Predicate::ICMP_UGE); + + // Make each thread reduce into its local memory + for(auto &BB : NewLoop->getBlocks()){ + for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> PB(&I); + auto ptr = CI->getOperand(0); + auto ty = CI->getOperand(1)->getType(); + auto lptr = PB.CreateGEP(ty, ptr, ThreadIdx); + CI->setOperand(0, lptr); + f->removeFnAttr(Attribute::NoInline); + } + } + } + } + + TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple())); + TargetLibraryInfo TLI(TLII); + AssumptionCache AC(*Helper); + ScalarEvolution SE(*Helper, TLI, AC, DT, LI); + ValueToValueMapTy peelVMap; + simplifyLoop(NewLoop, &DT, &LI, &SE, &AC, nullptr, false); + peelLoop(NewLoop, 1, &LI, &SE, DT, &AC, false, peelVMap); + SmallVector cis; + for(auto &BB : *Helper){ + if(!NewLoop->contains(&BB)){ // better way? + for(auto &I : BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> pb(&I); + pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); + cis.push_back(&I); + } + } + } + } + } + for(auto &I : cis){ + I->eraseFromParent(); + } } void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, @@ -425,7 +472,7 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } -void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, ValueSet &LoopInputs) { +void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { Loop *L = TL.getLoop(); BasicBlock *PH = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); @@ -472,7 +519,6 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, V GS = RB.CreateCall(GPUGridSize); Value *NBytes = RB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); CallInst *Alloc = RB.CreateCall(GPUManagedMalloc, {NBytes}); - LoopInputs.insert(Alloc); // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing CI->setOperand(0, Alloc); @@ -511,7 +557,6 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, V BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), Idx->getName() + ".add"); BasicBlock* Body = BodyTerm->getParent(); - BasicBlock* LoopExit = ExitTerm->getParent(); Idx->addIncoming(IdxAdd, Body); ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp index ff5321ca23699..0be661c1112eb 100644 --- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp +++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp @@ -1536,10 +1536,14 @@ Function *LoopSpawningImpl::createHelperForTapirLoop( /// Outline all recorded Tapir loops in the function. TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { + std::map LoopVMaps; + // Prepare Tapir loops for outlining. for (Task *T : post_order(TI.getRootTask())) { - if (TapirLoopInfo *TL = getTapirLoop(T)) { + // Run a pre-processing step before we create the helper function. + if (TapirLoopInfo *TL = getTapirLoop(T)) { + PredicatedScalarEvolution PSE(SE, *TL->getLoop()); bool canOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME, @@ -1556,6 +1560,8 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { OutlineProcessors[TL] = std::unique_ptr(getOutlineProcessor(TL)); + Loop *L = TL->getLoop(); + OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L]); } } @@ -1600,9 +1606,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { Loop *L = TL->getLoop(); LLVM_DEBUG(dbgs() << "Outlining Tapir " << *L << "\n"); - ValueToValueMapTy VMap; - // Run a pre-processing step before we create the helper function. - OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap, LoopInputSets[L]); // Convert the inputs of the Tapir loop to inputs to the helper. ValueSet TLInputsFixed; @@ -1647,6 +1650,8 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { LoopInputs[L].push_back(V); LoopArgStarts[L] = ArgStart; + ValueToValueMapTy &VMap = LoopVMaps[L]; + // Create the helper function. Function *Outline = createHelperForTapirLoop( TL, LoopArgs[L], OutlineProcessors[TL]->getIVArgIndex(F, LoopArgs[L]), From c4812367f6acfe687ca30d8d11754e4345b4488c Mon Sep 17 00:00:00 2001 From: George Stelle Date: Tue, 14 Jan 2025 14:37:53 -0700 Subject: [PATCH 14/14] GPU reductions loop info fixes --- kitsune-tests/reductions/l2.c | 29 +++++---- kitsune-tests/reductions/makefile | 8 +-- llvm/include/llvm/Transforms/Tapir/GPUABI.h | 2 +- .../llvm/Transforms/Tapir/LoweringUtils.h | 3 +- llvm/lib/Transforms/Tapir/GPUABI.cpp | 18 ++---- llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp | 5 +- llvm/lib/Transforms/Tapir/LoopStripMine.cpp | 63 +++++++++++-------- 7 files changed, 71 insertions(+), 57 deletions(-) diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c index 74fc51ec8337e..c1918baee371c 100644 --- a/kitsune-tests/reductions/l2.c +++ b/kitsune-tests/reductions/l2.c @@ -3,6 +3,7 @@ #include #include #include +#include #include reduction @@ -14,11 +15,10 @@ __attribute__((noinline)) double l2(uint64_t n, double* a){ double red = 0; forall(uint64_t i=0; igetLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); BranchInst *LatchBR = cast(Latch->getTerminator()); + BasicBlock &Entry = Header->getParent()->getEntryBlock(); unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); DetachInst *DI = cast(Header->getTerminator()); @@ -512,13 +513,13 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { // gs = gridSize() // reds = gpuManagedMalloc(gs) // br header - IRBuilder<> RB(&PH->front()); + IRBuilder<> EB(Entry.getFirstNonPHI()); CallInst *CI = Red.first; Value *Ptr = CI->getArgOperand(0); Type *Ty = Red.second; - GS = RB.CreateCall(GPUGridSize); - Value *NBytes = RB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); - CallInst *Alloc = RB.CreateCall(GPUManagedMalloc, {NBytes}); + GS = EB.CreateCall(GPUGridSize); + Value *NBytes = EB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); + CallInst *Alloc = EB.CreateCall(GPUManagedMalloc, {NBytes}); // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing CI->setOperand(0, Alloc); @@ -559,13 +560,6 @@ void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { BasicBlock* Body = BodyTerm->getParent(); Idx->addIncoming(IdxAdd, Body); ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); - - // Update Loopinfo with reduction loop - //Loop* RL = LI->AllocateLoop(); - //if(ParentLoop) ParentLoop->addChildLoop(RL); - //else LI->addTopLevelLoop(RL); - //RL->addBasicBlockToLoop(RedEpiHeader, *LI); - //RL->addBasicBlockToLoop(body, *LI); } LLVM_DEBUG(dbgs() << "Finished preProcessTapirLoop: " << *PH->getParent()); diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp index 0be661c1112eb..8d2320688143c 100644 --- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp +++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp @@ -1561,10 +1561,13 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { std::unique_ptr(getOutlineProcessor(TL)); Loop *L = TL->getLoop(); - OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L]); + OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L], LI); } } + LI.releaseMemory(); + LI.analyze(DT); + TaskOutlineMapTy TaskToOutline; DenseMap LoopInputSets; DenseMap> LoopCtlArgs; diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 0d12868b37c60..dfb9cfaeda4f7 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -1533,29 +1533,14 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Record that the remainder loop was derived from a Tapir loop. (*RemainderLoop)->setDerivedFromTapirLoop(); - - // At this point, the code is well formed. We now simplify the new loops, - // doing constant propagation and dead code elimination as we go. - simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC); - simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI, - AC); - simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT, - TTI, AC); + // Record that the old loop was derived from a Tapir loop. + L->setDerivedFromTapirLoop(); #ifndef NDEBUG DT->verify(); LI->verify(*DT); #endif - // Record that the old loop was derived from a Tapir loop. - L->setDerivedFromTapirLoop(); - - // Update TaskInfo manually using the updated DT. - if (TI) - // FIXME: Recalculating TaskInfo for the whole function is wasteful. - // Optimize this routine in the future. - TI->recalculate(*F, *DT); - // accumulate reductions in main loop const std::vector& blocks = L->getBlocks(); std::set> reductions; @@ -1680,15 +1665,16 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, Instruction* term = LatchExit->getTerminator(); BasicBlock *PostSync = term->getSuccessor(0); - BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + BasicBlock *RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT, LI, nullptr, "reductionEpilogue"); + //BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); RedEpiHeader->moveAfter(LatchExit); - ReplaceInstWithInst(term, SyncInst::Create(RedEpiHeader, SyncReg)); - BranchInst::Create(PostSync, RedEpiHeader); + ReplaceInstWithInst(PostSync->getTerminator(), SyncInst::Create(RedEpiHeader, SyncReg)); + //BranchInst::Create(PostSync, RedEpiHeader); PHINode *Idx = PHINode::Create(outerIters->getType(), 2, "reductionepilogueidx", RedEpiHeader->getFirstNonPHI()); IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); - Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit); + Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), PostSync); Instruction *bodyTerm, *exitTerm; Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); @@ -1715,16 +1701,43 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Update Loopinfo with reduction loop Loop* RL = LI->AllocateLoop(); - if(ParentLoop) ParentLoop->addChildLoop(RL); - else LI->addTopLevelLoop(RL); - RL->addBasicBlockToLoop(RedEpiHeader, *LI); - RL->addBasicBlockToLoop(body, *LI); + if(!ParentLoop){ + LI->addTopLevelLoop(RL); + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); + } + else { + ParentLoop->addChildLoop(RL); + LI->changeLoopFor(RedEpiHeader, RL); + RL->addBlockEntry(RedEpiHeader); + LI->changeLoopFor(body, RL); + RL->addBlockEntry(body); + } + simplifyLoop(RL, DT, LI, SE, AC, nullptr, PreserveLCSSA); } LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); + // At this point, the code is well formed. We now simplify the new loops, + // doing constant propagation and dead code elimination as we go. + simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC); + simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI, + AC); + simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT, + TTI, AC); + + // TODO: fix DT updates DT->recalculate(*F); + LI->releaseMemory(); + LI->analyze(*DT); + + // Update TaskInfo manually using the updated DT. + if (TI) + // FIXME: Recalculating TaskInfo for the whole function is wasteful. + // Optimize this routine in the future. + TI->recalculate(*F, *DT); + #ifndef NDEBUG DT->verify();