diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 40e9848e513a8..05cdb8a377345 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4725,6 +4725,14 @@ def ClspvLibclcBuiltin: InheritableAttr {
   let SimpleHandler = 1;
 }
 
+// +===== kitsune-/tapir-centric attributes
+
+def KitsuneReduction : InheritableAttr {
+  let Spellings = [Clang<"kitsune_reduction">];
+  let Subjects = SubjectList<[FunctionLike]>;
+  let Documentation = [StrandMallocDocs];
+}
+
 def TapirTarget : StmtAttr {
   let Spellings = [CXX11<"tapir","target">];
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 283fe88bf2f72..4be6d0b4920ee 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2408,6 +2408,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
       FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly());
       FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
     }
+    if (TargetDecl->hasAttr<KitsuneReductionAttr>()) {
+      FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction);
+    }
     if (TargetDecl->hasAttr<RestrictAttr>())
       RetAttrs.addAttribute(llvm::Attribute::NoAlias);
     if (TargetDecl->hasAttr<ReturnsNonnullAttr>() &&
diff --git a/clang/lib/Headers/kitsune.h.cmake b/clang/lib/Headers/kitsune.h.cmake
new file mode 100644
index 0000000000000..d8e6cd4bf86ee
--- /dev/null
+++ b/clang/lib/Headers/kitsune.h.cmake
@@ -0,0 +1,145 @@
+
+/*
+ * Copyright (c) 2020 Triad National Security, LLC
+ * All rights reserved.
+ *
+ * This file is part of the kitsune/llvm project. It is released under
+ * the LLVM license.
+ */
+#ifndef __CLANG_KITSUNE_H__
+#define __CLANG_KITSUNE_H__
+
+#include <stdint.h>
+#include <stddef.h>
+
+#cmakedefine01 KITSUNE_ENABLE_OPENMP_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_QTHREADS_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_CUDA_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_GPU_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_REALM_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_OPENCL_ABI_TARGET
+#cmakedefine01 KITSUNE_ENABLE_HIP_ABI_TARGET
+
+#include "kitsune_rt.h"
+
+#if defined(reduction)
+#warning found reduction definition: try putting kitsune lower in include order
+#else
+#define reduction __attribute__((noinline, kitsune_reduction))
+#endif
+
+
+#if defined(KITSUNE_ENABLE_OPENCL_ABI_TARGET)
+#define ocl_mmap(a, n) __kitsune_opencl_mmap_marker((void*)a, n)
+#ifdef __cplusplus
+extern "C" {
+#endif
+  void __kitsune_opencl_mmap_marker(void* ptr, uint64_t n);
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+#if defined(spawn)
+#warning encountered multiple definitions of spawn!
+#else
+#define spawn _kitsune_spawn
+#endif
+
+#if defined(sync)
+#warning encountered multiple definitions of sync!
+#else
+#define sync _kitsune_sync
+#endif
+
+#if defined(forall)
+#warning encountered multiple definitions of forall!
+#else +#define forall _kitsune_forall +#endif + + +#if defined(_tapir_cuda_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_cuMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_cuMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_cuMemFree(void*); + template + void dealloc(T* array) { + __kitrt_cuMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_cuMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_cuMemAllocManaged(total_bytes); + } + + void __kitrt_cuMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_cuMemFree(array); + } + #endif +#elif defined(_tapir_hip_target) + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_hipMemAllocManaged(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_hipMemAllocManaged(sizeof(T) * N); + } + + extern "C" void __kitrt_hipMemFree(void*); + template + void dealloc(T* array) { + __kitrt_hipMemFree((void*)array); + } + #else + void* __attribute__((malloc)) __kitrt_hipMemAllocManaged(size_t); + inline __attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_hipMemAllocManaged(total_bytes); + } + + void __kitrt_hipMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void *array) { + __kitrt_hipMemFree(array); + } + #endif +#else + #ifdef __cplusplus + extern "C" __attribute__((malloc)) void* __kitrt_defaultMemAlloc(size_t); + template + inline __attribute__((always_inline)) + T* alloc(size_t N) { + return (T*)__kitrt_defaultMemAlloc(sizeof(T) * N); + } + + extern "C" void __kitrt_defaultMemFree(void*); + template + void dealloc(T* array) { + __kitrt_defaultMemFree(array); + } + #else + void* __attribute__((malloc)) __kitrt_defaultMemAlloc(size_t); + inline 
__attribute__((always_inline)) + void *alloc(size_t total_bytes) { + return __kitrt_defaultMemAlloc(total_bytes); + } + + void __kitrt_defaultMemFree(void*); + inline __attribute__((always_inline)) + void dealloc(void* array) { + __kitrt_defaultMemFree(array); + } + #endif // __cplusplus +#endif // cpu targets + +#endif diff --git a/clang/lib/Headers/magma.h b/clang/lib/Headers/magma.h new file mode 100644 index 0000000000000..43c4fb8f1075f --- /dev/null +++ b/clang/lib/Headers/magma.h @@ -0,0 +1,43 @@ +#include +#include + +template +struct Magma { + virtual a op(a x, a y) = 0; +}; + +template +struct UnitalMagma : public Magma { + virtual a id() = 0; +}; + +// Example unital magmas +template +struct Sum : UnitalMagma{ + a op(a x, a y){ return x + y; } + a id(){ return 0; } // look into this more +}; + +template +struct Product : UnitalMagma { + a op(a x, a y){ return x * y; } + a id(){ return 1; } +}; + +struct StringApp : UnitalMagma { + std::string op(std::string x, std::string y){ return x.append(y); } + std::string id() { return ""; } +}; + +template +struct Max : UnitalMagma { + a op(a x, a y){ return x > y ? x : y; } + a id() { return std::numeric_limits::min(); } +}; + +template +struct Min : UnitalMagma { + a op(a x, a y){ return x < y ? 
x : y; } + a id() { return std::numeric_limits::max(); } +}; + diff --git a/clang/lib/Headers/reductions.h b/clang/lib/Headers/reductions.h new file mode 100644 index 0000000000000..d9abf9cf276b5 --- /dev/null +++ b/clang/lib/Headers/reductions.h @@ -0,0 +1,56 @@ +#include"magma.h" +#include +#include +#include +//#include + +template +a reduce(um m, v& xs){ + auto acc = m.id(); + for(auto x : xs){ + acc = m.op(acc, x); + } + return acc; +} + +template +a parReduce(um m, v& xs, uint64_t nthreads){ + uint64_t linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + assert(linesize % sizeof(a) == 0); + uint64_t linenum = linesize / sizeof(a); + a* accs = new a[nthreads * linenum]; + uint64_t size = xs.end() - xs.begin(); + assert(size % nthreads == 0); + uint64_t grainsize = size / nthreads; + for(uint64_t i=0; i +a treeReduce(um m, v& xs, uint64_t start, uint64_t end, uint64_t gs){ + if(end-start < gs){ + a acc = m.id(); + for(uint64_t i=start; i(S, D, AL); + break; } } diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c new file mode 100644 index 0000000000000..c1918baee371c --- /dev/null +++ b/kitsune-tests/reductions/l2.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(double *a, double b){ + *a += b; +} + +__attribute__((noinline)) +double l2(uint64_t n, double* a){ + double red = 0; + forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include + +double l2(uint64_t n, double* a){ + double red = 0; + #pragma omp parallel for reduction(+:red) + for(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +... 
+  double sum = reduce(Sum<double>(), big);
+```
+
+
+
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 40ae85cf16919..fa5bdd2c5cc4b 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -764,6 +764,7 @@ enum AttributeKindCodes {
   ATTR_KIND_HYBRID_PATCHABLE = 95,
   ATTR_KIND_SANITIZE_CILK = 96,
   ATTR_KIND_STEALABLE = 97,
+  ATTR_KIND_KITSUNE_REDUCTION = 98,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index bea25b9ed4191..63f17c762ca6b 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -239,6 +239,13 @@ def ReadNone : EnumAttr<"readnone", [ParamAttr]>;
 /// Function only reads from memory.
 def ReadOnly : EnumAttr<"readonly", [ParamAttr]>;
 
+/// Tapir reducer-related attributes.
+def HyperView : EnumAttr<"hyper_view", [FnAttr]>;
+def HyperToken : EnumAttr<"hyper_token", [FnAttr]>;
+def ReducerRegister : EnumAttr<"reducer_register", [FnAttr]>;
+def ReducerUnregister : EnumAttr<"reducer_unregister", [FnAttr]>;
+def KitsuneReduction : EnumAttr<"kitsune_reduction", [FnAttr]>;
+
 /// Return value is always equal to this argument.
 def Returned : EnumAttr<"returned", [ParamAttr]>;
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index 59ba8919d8605..d98c0c5f4dfd0 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -571,7 +571,6 @@ class GlobalValue : public Constant {
   // FIXME: need a better solution but also too difficult to re-outline
   // everything w/ tapir for GPU targets (AMD in particular)...
-  //#warning "FIXME: need better solution than mutateType()."
void mutateValueType(Type *Ty) { ValueType = Ty; } diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h index 63bdd19cf6e63..3b45e2cc3754f 100644 --- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h +++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h @@ -60,13 +60,14 @@ class LLVMLoop : public LoopOutlineProcessor { static unsigned NextKernelID; unsigned MyKernelID; Module LLVMM; - TargetMachine *LLVMTargetMachine; GlobalVariable *LLVMGlobal; FunctionCallee GetThreadIdx = nullptr; FunctionCallee GPUInit = nullptr; FunctionCallee GPULaunchKernel = nullptr; FunctionCallee GPUWaitKernel = nullptr; + FunctionCallee GPUManagedMalloc = nullptr; + FunctionCallee GPUGridSize = nullptr; SmallVector OrderedInputs; public: @@ -86,8 +87,9 @@ class LLVMLoop : public LoopOutlineProcessor { ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; + void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo& LI) override final; }; -} +} //namespace llvm #endif /* diff --git a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h index e2e65b4000fed..be0c8a5ed8367 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h +++ b/llvm/include/llvm/Transforms/Tapir/LoweringUtils.h @@ -17,6 +17,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Tapir/TapirTargetIDs.h" @@ -449,6 +450,9 @@ class LoopOutlineProcessor { virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { /* no-op */ } + virtual void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo &LI) + { preProcessTapirLoop(TL, VMap); } + /// Processes an outlined Function Helper for a Tapir loop, just after 
the /// function has been outlined. virtual void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 92411913c43ee..09306d88ae805 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2189,6 +2189,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Range; case bitc::ATTR_KIND_INITIALIZES: return Attribute::Initializes; + case bitc::ATTR_KIND_KITSUNE_REDUCTION: + return Attribute::KitsuneReduction; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 85f9d1556c26c..5251549c36f2d 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -889,6 +889,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_RANGE; case Attribute::Initializes: return bitc::ATTR_KIND_INITIALIZES; + case Attribute::KitsuneReduction: + return bitc::ATTR_KIND_KITSUNE_REDUCTION; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index af302214d2545..66ec4c98d9d4e 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -16,13 +16,17 @@ #if 0 #include "llvm/Transforms/Tapir/GPUABI.h" +#include "llvm/Transforms/Tapir/TapirToTarget.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Tapir/Outline.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopPeel.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Scalar.h" 
#include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Vectorize.h" @@ -37,9 +41,18 @@ using namespace llvm; #define DEBUG_TYPE "gpuabi" -Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { - Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); +// JIT compiler kernel at containing function entry, makes timing easier at the +// cost of less laziness +static cl::opt + JIT("jit-callsite", cl::init(false), cl::NotHidden, + cl::desc("Wait until parallel loop is called to jit kernel. " + "(default=false)")); +Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { + //Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); + Module *M = GrainsizeCall->getModule(); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Value* Grainsize = CallInst::Create(M->getOrInsertFunction("gpuGridSize", Int64Ty), {}, GrainsizeCall); // Replace uses of grainsize intrinsic call with this grainsize value. GrainsizeCall->replaceAllUsesWith(Grainsize); return Grainsize; @@ -96,19 +109,17 @@ LLVMLoop::LLVMLoop(Module &M) // Insert runtime-function declarations in LLVM host modules. 
Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); - Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext()); GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty); - Function* getid = LLVMM.getFunction("gtid"); Type *VoidTy = Type::getVoidTy(M.getContext()); Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - Type *Int8Ty = Type::getInt8Ty(M.getContext()); - Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty); GPUWaitKernel = M.getOrInsertFunction("waitKernel", VoidTy, VoidPtrTy); + GPUManagedMalloc = M.getOrInsertFunction("gpuManagedMalloc", VoidPtrTy, Int64Ty); + GPUGridSize = M.getOrInsertFunction("gpuGridSize", Int64Ty); } void LLVMLoop::setupLoopOutlineArgs( @@ -176,19 +187,25 @@ unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { - LLVMContext &Ctx = M.getContext(); - Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); - //Type *Int64Ty = Type::getInt64Ty(Ctx); - //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); + Function *Helper = Out.Outline; + + DominatorTree DT; + DT.recalculate(*Helper); + LoopInfo LI(DT); + + // The loop we care about is the outermost loop in the BasicBlock *Entry = cast(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast(VMap[L->getHeader()]); BasicBlock *Exit = cast(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); + + Loop *NewLoop = LI.getLoopFor(Header); + + InductionDescriptor ID = TL.getPrimaryInduction().second; Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); Instruction *ClonedSyncReg = 
cast( VMap[T->getDetach()->getSyncRegion()]); @@ -205,8 +222,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); - Function *Helper = Out.Outline; - Helper->setName("kitsune_kernel"); + Helper->setName("kitsune_kernel"); // Fix argument pointer types to global, nocapture // TODO: read/write attributes? LLVM_DEBUG(dbgs() << "Function type after globalization of argument pointers << " << *Helper->getType() << "\n"); @@ -231,11 +247,7 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Grainsize argument is the third LC arg. Grainsize = &*++(++OutlineArgsIter); } - ThreadID = B.CreateMul(ThreadID, Grainsize); - Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); - Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); - Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); - Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); + Value *Cond = B.CreateICmpUGE(ThreadID, End); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); @@ -245,24 +257,73 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Update cloned loop condition to use the thread-end value. 
unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast(VMap[TL.getCondition()]); - if (ClonedCond->getOperand(0) != ThreadEnd) + if (ClonedCond->getOperand(0) != End) ++TripCountIdx; - ClonedCond->setOperand(TripCountIdx, ThreadEnd); - assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && + ClonedCond->setOperand(TripCountIdx, End); + assert(ClonedCond->getOperand(TripCountIdx) == End && "End argument not used in condition"); + // we replace the step + assert(TL.getPrimaryInduction().second.getInductionBinOp()->getOpcode() == Instruction::BinaryOps::Add && + "Only support gpu kernels with addition for induction step"); + ConstantInt *CStep = cast(VMap[ID.getConstIntStepValue()]); + BinaryOperator *BinOp = cast(VMap[ID.getInductionBinOp()]); + assert(CStep && BinOp && "Couldn't infer step or operator from primary induction gpu variable"); + IRBuilder<> IncB(&BinOp->getParent()->front()); + Value* NewStep = IncB.CreateMul(CStep, Grainsize); + BinOp->setOperand(1, NewStep); + ClonedCond->setPredicate(ICmpInst::Predicate::ICMP_UGE); + + // Make each thread reduce into its local memory + for(auto &BB : NewLoop->getBlocks()){ + for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> PB(&I); + auto ptr = CI->getOperand(0); + auto ty = CI->getOperand(1)->getType(); + auto lptr = PB.CreateGEP(ty, ptr, ThreadIdx); + CI->setOperand(0, lptr); + f->removeFnAttr(Attribute::NoInline); + } + } + } + } + TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple())); + TargetLibraryInfo TLI(TLII); + AssumptionCache AC(*Helper); + ScalarEvolution SE(*Helper, TLI, AC, DT, LI); + ValueToValueMapTy peelVMap; + simplifyLoop(NewLoop, &DT, &LI, &SE, &AC, nullptr, false); + peelLoop(NewLoop, 1, &LI, &SE, DT, &AC, false, peelVMap); + SmallVector cis; + for(auto &BB : *Helper){ + if(!NewLoop->contains(&BB)){ // better way? 
+ for(auto &I : BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> pb(&I); + pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); + cis.push_back(&I); + } + } + } + } + } + for(auto &I : cis){ + I->eraseFromParent(); + } } void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); - //Task *T = TL.getTask(); - //Instruction *ReplCall = cast(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; @@ -272,12 +333,7 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, IRBuilder<> B(&NBB->front()); - // Compile the kernel - //LLVMM.getFunctionList().remove(TOI.Outline); - //TOI.Outline->eraseFromParent(); - LLVMContext &LLVMCtx = LLVMM.getContext(); - - ValueToValueMapTy VMap; + ValueToValueMapTy VMap; // We recursively add definitions and declarations to the device module SmallVector todo; todo.push_back(LLVMM.getFunction("kitsune_kernel")); @@ -304,7 +360,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, SmallVector Returns; CloneFunctionInto(deviceF, f, VMap, CloneFunctionChangeType::DifferentModule, Returns); // GPU calls are slow as balls, try to force inlining - deviceF->addFnAttr(Attribute::AlwaysInline); + if(!deviceF->hasFnAttribute(Attribute::NoInline)) + deviceF->addFnAttr(Attribute::AlwaysInline); todo.push_back(deviceF); } } @@ -388,20 +445,10 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - //Value* TripCount = 
isSRetInput(TOI.InputSet[0]) ? TOI.InputSet[1] : TOI.InputSet[0]; - //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, - // TOI.InputSet)); - //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, - // TOI.InputSet)); - - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); - Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); - Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - - Constant *kernelSize = ConstantInt::get(Int64Ty, - LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); - BasicBlock &EBB = Parent->getEntryBlock(); - IRBuilder<> EB(&EBB.front()); + Constant *kernelSize = ConstantInt::get(Int64Ty, + LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); + BasicBlock &EBB = Parent->getEntryBlock(); + IRBuilder<> EB(&EBB.front()); EB.CreateCall(GPUInit, {}); ArrayType* arrayType = ArrayType::get(VoidPtrTy, OrderedInputs.size()); @@ -417,23 +464,103 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, B.CreateStore(VoidVPtr, argPtr); } - Value *Grainsize = TL.getGrainsize() ? 
- ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : - OrderedInputs[2]; - - //Type *Int64Ty = Type::getInt64Ty(LLVMM.getContext()); - Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); - Value *RunRem = B.CreateURem(TripCount, Grainsize); - Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); - Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); - Value *RunSize = B.CreateZExt(B.CreateAdd(RunSizeQ, IsRemAdd), Int64Ty); - - Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); - Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); - Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, RunSize }); + Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); + Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); + Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } -#endif +void LLVMLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap, LoopInfo &LI) { + Loop *L = TL.getLoop(); + BasicBlock *PH = L->getLoopPreheader(); + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + BranchInst *LatchBR = cast(Latch->getTerminator()); + BasicBlock &Entry = Header->getParent()->getEntryBlock(); + unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 
1 : 0; + BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + DetachInst *DI = cast(Header->getTerminator()); + Value *SyncReg = DI->getSyncRegion(); + Value *GS; + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // accumulate Reductions in main loop + const std::vector& Blocks = L->getBlocks(); + std::set> Reductions; + for (BasicBlock *BB : Blocks){ + for (Instruction &I : *BB) { + if(auto *CI = dyn_cast(&I)){ + auto *F = CI->getCalledFunction(); + if(F->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << CI->getArgOperand(0)->getName() << + "with reduction function: " << F->getName() << "\n"); + auto *Ty = CI->getArgOperand(1)->getType(); + Reductions.insert(std::make_pair(CI, Ty)); + //TODO: check the type to confirm valid reduction + } + } + } + } + + std::vector> RedMap; + + for(auto &Red : Reductions){ + // We assume there's a preheader, and use it to insert allocations for reductions: + // preheader: + // gs = gridSize() + // reds = gpuManagedMalloc(gs) + // br header + IRBuilder<> EB(Entry.getFirstNonPHI()); + CallInst *CI = Red.first; + Value *Ptr = CI->getArgOperand(0); + Type *Ty = Red.second; + GS = EB.CreateCall(GPUGridSize); + Value *NBytes = EB.CreateMul(GS, ConstantInt::get(GS->getType(), DL.getTypeAllocSize(Ty))); + CallInst *Alloc = EB.CreateCall(GPUManagedMalloc, {NBytes}); + // We overwrite in the body the location to Alloc, which will be replaced with a GEP using the tid in postprocessing + CI->setOperand(0, Alloc); + + RedMap.push_back(std::make_tuple(CI, Ptr, Alloc, Ty)); + } + + if(!Reductions.empty()){ + Instruction* Term = LatchExit->getTerminator(); + BasicBlock *PostSync = Term->getSuccessor(0); + BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(Term, SyncInst::Create(RedEpiHeader, SyncReg)); + 
BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(GS->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(ConstantInt::get(GS->getType(), 0), LatchExit); + Instruction *BodyTerm, *ExitTerm; + Value *Cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, GS); + SplitBlockAndInsertIfThenElse(Cmp, RedEpiHeader->getTerminator(), &BodyTerm, &ExitTerm); + + IRBuilder<> BB(BodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. + for(auto& KV : RedMap){ + const auto [ CI, Ptr, Al, Ty ] = KV; + Value* Lptr = BB.CreateBitCast( + BB.CreateGEP(Ty, Al, Idx), + Ptr->getType()); + Value *LR = BB.CreateLoad(Ty, Lptr); + BB.SetCurrentDebugLocation(CI->getDebugLoc()); + BB.CreateCall(CI->getCalledFunction(), { Ptr, LR }); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + BasicBlock* Body = BodyTerm->getParent(); + Idx->addIncoming(IdxAdd, Body); + ReplaceInstWithInst(BodyTerm, BranchInst::Create(RedEpiHeader)); + } + + LLVM_DEBUG(dbgs() << "Finished preProcessTapirLoop: " << *PH->getParent()); +} diff --git a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp index d98fdad17043a..8d2320688143c 100644 --- a/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp +++ b/llvm/lib/Transforms/Tapir/LoopSpawningTI.cpp @@ -1535,11 +1535,18 @@ Function *LoopSpawningImpl::createHelperForTapirLoop( /// Outline all recorded Tapir loops in the function. TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() { + + std::map LoopVMaps; + // Prepare Tapir loops for outlining. for (Task *T : post_order(TI.getRootTask())) { + // Run a pre-processing step before we create the helper function. 
+
     if (TapirLoopInfo *TL = getTapirLoop(T)) {
+
       PredicatedScalarEvolution PSE(SE, *TL->getLoop());
-      bool CanOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME,
+
+      bool CanOutline = TL->prepareForOutlining(DT, LI, TI, PSE, AC, LS_NAME,
                                                 ORE, TTI);
       if (!CanOutline) {
         const Loop *L = TL->getLoop();
@@ -1552,9 +1559,15 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       // Get an outline processor for each Tapir loop.
       OutlineProcessors[TL] =
           std::unique_ptr<LoopOutlineProcessor>(getOutlineProcessor(TL));
+
+      Loop *L = TL->getLoop();
+      OutlineProcessors[TL]->preProcessTapirLoop(*TL, LoopVMaps[L], LI);
     }
   }
 
+  LI.releaseMemory();
+  LI.analyze(DT);
+
   TaskOutlineMapTy TaskToOutline;
   DenseMap<Loop *, ValueSet> LoopInputSets;
   DenseMap<Loop *, SmallVector<Value *, 8>> LoopCtlArgs;
@@ -1589,12 +1602,14 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
     } // end timed region
 
     TapirLoopInfo *TL = getTapirLoop(T);
+    if (!TL) continue;
     Loop *L = TL->getLoop();
 
     LLVM_DEBUG(dbgs() << "Outlining Tapir " << *L << "\n");
 
+
     // Convert the inputs of the Tapir loop to inputs to the helper.
     ValueSet TLInputsFixed;
     ValueToValueMapTy InputMap;
@@ -1611,7 +1626,6 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
           OutlineProcessors[TL]->getArgStructMode(), InputMap, L);
     } // end timed region
 
-
     ValueSet HelperArgs;
     SmallVector<Value *, 8> HelperInputs;
     {
@@ -1619,6 +1633,7 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
                          "Setup inputs to Tapir-loop helper function",
                          TimerGroupName, TimerGroupDescription,
                          TimePassesIsEnabled);
+
       OutlineProcessors[TL]->setupLoopOutlineArgs(
           F, HelperArgs, HelperInputs, LoopInputSets[L], LoopCtlArgs[L],
           LoopCtlInputs[L], TLInputsFixed);
@@ -1638,11 +1653,8 @@ TaskOutlineMapTy LoopSpawningImpl::outlineAllTapirLoops() {
       LoopInputs[L].push_back(V);
     LoopArgStarts[L] = ArgStart;
 
-    ValueToValueMapTy VMap;
-
-    // Run a pre-processing step before we create the helper function.
- OutlineProcessors[TL]->preProcessTapirLoop(*TL, VMap); - + ValueToValueMapTy &VMap = LoopVMaps[L]; + // Create the helper function. Function *Outline = createHelperForTapirLoop( TL, LoopArgs[L], OutlineProcessors[TL]->getIVArgIndex(F, LoopArgs[L]), diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index 99bc2c9a962b7..dfb9cfaeda4f7 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -37,6 +37,7 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; @@ -804,6 +805,10 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + + Function *F = Header->getParent(); + LLVM_DEBUG(dbgs() << "Function before strip mining\n" << *F); + // We will use the increment of the primary induction variable to derive // wrapping flags. Instruction *PrimaryInc = @@ -1528,6 +1533,190 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Record that the remainder loop was derived from a Tapir loop. (*RemainderLoop)->setDerivedFromTapirLoop(); + // Record that the old loop was derived from a Tapir loop. 
+ L->setDerivedFromTapirLoop(); + +#ifndef NDEBUG + DT->verify(); + LI->verify(*DT); +#endif + + // accumulate reductions in main loop + const std::vector& blocks = L->getBlocks(); + std::set> reductions; + for (BasicBlock *BB : blocks){ + for (Instruction &I : *BB) { + if(auto ci = dyn_cast(&I)){ + auto f = ci->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() << + "with reduction function: " << f->getName() << "\n"); + auto ty = ci->getArgOperand(1)->getType(); + reductions.insert(std::make_pair(ci, ty)); + //TODO: check the type to confirm valid reduction + } + } + } + } + + // accumulate reductions in epilog loop + LLVM_DEBUG(dbgs() << "Found " << reductions.size() << " reduction variables in loop\n"); + + std::vector> redMap; + // TODO: Modify the strip mining outer loop to be smaller: currently we are + // stack allocating n/2048 reduction values. + // TODO: Initialize local reductions with unit values + Instruction *bloc = nullptr; + if(Instruction* I = dyn_cast(TripCount)){ + bloc = I->getNextNode(); + } else { + bloc = F->getEntryBlock().getTerminator(); + } + IRBuilder<> RB(bloc); + Value *outerIters = RB.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); + auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1)); + for(auto &pair : reductions){ + // TODO: generic allocation/free calls + auto ci = pair.first; + auto ptr = ci->getArgOperand(0); + auto ty = pair.second; + auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false); + auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty))); + auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize}); + //auto al = RB.CreateBitCast(rm, ty); + //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction"); + IRBuilder<> 
BH(NewLoop->getHeader()->getTerminator()); + auto lptr = BH.CreateBitCast( + BH.CreateGEP(ty, al, NewIdx), + ptr->getType()); + redMap.push_back(std::make_tuple(ci, ptr, al, ty)); + // Assume there is more than one element, and + // use the first element for the first iteration of the loop. + // roughly: + // red = init; + // forall(i = ...){ + // red = reduce(red, body(i)); + // } + // red = init; + // localred[m+1]; + // + // forall(k ∈ 0..m-1){ + // localred[i] = body(j_0); + // for(j ∈ j_k_1..j_k_l-1) + // reduce(localred+i, body(j)); + // } + // for( j ∈ j_k_m .. n ) + // reduce(localred+m, body(j)); + // } + // for(k ∈ 0..m) + // reduce(&red, localred[k]); + // + ptr->replaceUsesWithIf(lptr, [L](Use &u){ + if(auto I = dyn_cast(u.getUser())){ + return L->contains(I->getParent()); + } else { + return false; + }; + }); + } + + // Epilog "join" of reduction values stored in local reduction value arrays. + // Should be able to use redMap to map original pointer (which is still used + // to reduce the remainder of the strimined loop, so you probably want to + // start the reduction with that value). + LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F); + + // We insert the reduction code at every sync corresponding to the strimined + // loop + // + // Sync + // RedEpiHeader + // RedEpiBody + // RedEpiExit + + // Todo: re-order epilogue and reduction epilogue to preserve associativity + if(!reductions.empty()){ + // Peel the first iteration of the loop and replace the reduction calls in + // the peeled code with stores + ValueToValueMapTy VMap; + peelLoop(L, 1, LI, SE, *DT, AC, PreserveLCSSA, VMap); + SmallVector cis; + for(auto &BB : NewLoop->blocks()){ + if(!L->contains(BB)){ // better way? 
+ for(auto &I : *BB){ + if(auto *CI = dyn_cast(&I)){ + auto *f = CI->getCalledFunction(); + if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){ + IRBuilder<> pb(&I); + pb.CreateStore(CI->getArgOperand(1), CI->getArgOperand(0)); + cis.push_back(&I); + f->removeFnAttr(Attribute::NoInline); + } + } + } + } + } + + for(auto &I : cis){ + I->eraseFromParent(); + } + + Instruction* term = LatchExit->getTerminator(); + BasicBlock *PostSync = term->getSuccessor(0); + BasicBlock *RedEpiHeader = SplitBlock(PostSync, PostSync->getTerminator(), DT, LI, nullptr, "reductionEpilogue"); + //BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(), LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(PostSync->getTerminator(), SyncInst::Create(RedEpiHeader, SyncReg)); + //BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHI()); + IRBuilder<> BH(RedEpiHeader->getFirstNonPHI()); + Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), PostSync); + Instruction *bodyTerm, *exitTerm; + Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. 
+ for(auto& kv : redMap){ + const auto [ ci, ptr, al, ty ] = kv; + auto lptr = BB.CreateBitCast( + BB.CreateGEP(ty, al, Idx), + ptr->getType()); + auto x = BB.CreateLoad(ty, lptr); + BB.SetCurrentDebugLocation(ci->getDebugLoc()); + BB.CreateCall(ci->getCalledFunction(), { ptr, x }); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + BasicBlock* body = bodyTerm->getParent(); + BasicBlock* loopExit = exitTerm->getParent(); + Idx->addIncoming(IdxAdd, body); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + + // Update Loopinfo with reduction loop + Loop* RL = LI->AllocateLoop(); + if(!ParentLoop){ + LI->addTopLevelLoop(RL); + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); + } + else { + ParentLoop->addChildLoop(RL); + LI->changeLoopFor(RedEpiHeader, RL); + RL->addBlockEntry(RedEpiHeader); + LI->changeLoopFor(body, RL); + RL->addBlockEntry(body); + } + simplifyLoop(RL, DT, LI, SE, AC, nullptr, PreserveLCSSA); + } + + LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); // At this point, the code is well formed. We now simplify the new loops, // doing constant propagation and dead code elimination as we go. @@ -1537,13 +1726,11 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC); -#ifndef NDEBUG - DT->verify(); - LI->verify(*DT); -#endif - // Record that the old loop was derived from a Tapir loop. - L->setDerivedFromTapirLoop(); + // TODO: fix DT updates + DT->recalculate(*F); + LI->releaseMemory(); + LI->analyze(*DT); // Update TaskInfo manually using the updated DT. if (TI) @@ -1551,5 +1738,11 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Optimize this routine in the future. 
TI->recalculate(*F, *DT); + +#ifndef NDEBUG + DT->verify(); + LI->verify(*DT); +#endif + return NewLoop; } diff --git a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp index e048cdaf57c58..be51e49d0d5d7 100644 --- a/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp +++ b/llvm/lib/Transforms/Tapir/OpenCilkABI.cpp @@ -151,8 +151,20 @@ void OpenCilkABI::prepareModule() { if ("" != ClOpenCilkRuntimeBCPath) RuntimeBCPath = ClOpenCilkRuntimeBCPath; - if ("" == RuntimeBCPath) - C.emitError("OpenCilkABI: No OpenCilk bitcode ABI file given."); + std::optional path; + if("" == RuntimeBCPath){ + path = sys::Process::FindInEnvPath("LD_LIBRARY_PATH", "libopencilk-abi.bc"); + if (! path) + // TODO: This is an in-tree build solution for now... + #if defined(OPENCILK_BC_PATH) + path = OPENCILK_BC_PATH; + #else + report_fatal_error("Could not find OpenCilk runtime bitcode file " + "(libopencilk-abi.bc) in LD_LIBRARY_PATH."); + #endif + } else { + path = ClOpenCilkRuntimeBCPath.getValue(); + } LLVM_DEBUG(dbgs() << "Using external bitcode file for OpenCilk ABI: " << RuntimeBCPath << "\n");