diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 959367a73add2..50063de0d4067 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -5229,6 +5229,14 @@ def NonString : InheritableAttr { let Documentation = [NonStringDocs]; } +// +===== kitsune-/tapir-centric attributes + +def KitsuneReduction : InheritableAttr { + let Spellings = [Clang<"kitsune_reduction">]; + let Subjects = SubjectList<[FunctionLike]>; + let Documentation = [TapirRTDocs]; +} + def TapirTarget : StmtAttr { let Spellings = [CXX11<"tapir","target">]; @@ -5247,9 +5255,9 @@ def TapirTarget : StmtAttr { ErrorDiag, "'parallel' statements">; let Args = [ EnumArgument<"TapirTargetAttrType", "TapirTargetAttrTy", /*is_string=*/ true, - ["nolo", "serial", "cuda", "hip", "opencilk", "openmp", + ["nolo", "serial", "cuda", "hip", "gpu", "opencilk", "openmp", "qthreads", "realm"], - ["Nolo", "Serial", "Cuda", "Hip", "OpenCilk", "OpenMP", + ["Nolo", "Serial", "Cuda", "Hip", "GPU", "OpenCilk", "OpenMP", "Qthreads", "Realm"], 0> ]; diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 29ff8dbe1f12f..80d80e9699a00 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -901,6 +901,9 @@ def err_drv_unsupported_option_argument_for_frontend : Error< def err_drv_kitsune_missing_required : Error< "missing required option '--%0'">; +def err_drv_kitsune_tapir_required : Error< + "--tapir is required with '%0'">; + def err_drv_kitsune_optzns_required : Error< "--tapir requires optimization level O1 or higher">; def err_drv_kitsune_lto_o2_required : Error< @@ -944,8 +947,6 @@ def err_drv_opencilk_missing_abi_bitcode: Error< def err_drv_kitsune_kokkos_disabled : Error< "kokkos support was not enabled when kitsune was built">; -def err_drv_kitsune_kokkos_no_tapir : Error< - "--tapir is required with '%0'">; def 
err_drv_kitsune_target_not_enabled: Error< "tapir target '%0' was not enabled when kitsune was built">; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f71de4f5bfa4d..6d958332755ed 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4283,7 +4283,7 @@ def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group, HelpText<"Do not process trigraph sequences">, Visibility<[ClangOption, CC1Option]>; def funique_source_file_names: Flag<["-"], "funique-source-file-names">, Group, - HelpText<"Allow the compiler to assume that each translation unit has a unique " + HelpText<"Allow the compiler to assume that each translation unit has a unique " "source file identifier (see -funique-source-file-identifier) at link time">; def fno_unique_source_file_names: Flag<["-"], "fno-unique-source-file-names">; def unique_source_file_identifier_EQ: Joined<["-"], "funique-source-file-identifier=">, Group, @@ -7084,7 +7084,7 @@ defm android_pad_segment : BooleanFFlag<"android-pad-segment">, Group; def shared_libflangrt : Flag<["-"], "shared-libflangrt">, HelpText<"Link the flang-rt shared library">, Group, Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>; -def static_libflangrt : Flag<["-"], "static-libflangrt">, +def static_libflangrt : Flag<["-"], "static-libflangrt">, HelpText<"Link the flang-rt static library">, Group, Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>; @@ -9508,6 +9508,15 @@ def fkokkos_no_init : Flag<["-"], "fkokkos-no-init">, Alias, Visibility<[ClangOption, CC1Option]>, HelpText<"DEPRECATED: Use --kokkos-no-init">; +// Tapir currently requires at least -O1. This makes it difficult to write tests +// that check that the frontend generates the expected LLVM IR. If this +// restriction is ever removed, this option will no longer be necessary since +// we could just pass -O0 to examine the IR generated by the frontend. 
+def print_before_first: Joined<["--"], "print-before-first">, + Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, + HelpText<"Print the LLVM Module to stderr before running the optimization " + "pipeline">; + def kitrt_verbose: Joined<["--"], "kitrt-verbose">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, HelpText<"Enable verbose mode in kitsune's runtime">; @@ -9515,7 +9524,7 @@ def kitrt_verbose: Joined<["--"], "kitrt-verbose">, def tapir_EQ : Joined<["--"], "tapir=">, Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>, MetaVarName<"">, - Values<"none,serial,cuda,hip,opencilk">, + Values<"nolo,serial,cuda,hip,opencilk">, HelpText<"The primary tapir target">; def ftapir_EQ : Joined<["-"], "ftapir=">, Alias, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 0a82af6fbbbe6..03fb8a71a6746 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1216,6 +1216,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline( return; } + const KitsuneOptions& kitOpts = CI.getKitsuneOpts(); + if (kitOpts.hasTTID() && kitOpts.getPrintBeforeFirst()) + llvm::errs() << *TheModule << "\n"; + // Now that we have all of the passes ready, run them. 
{ PrettyStackTraceString CrashInfo("Optimizer"); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 2336c3da3b38c..9fad6a8bd4f39 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2512,6 +2512,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly()); FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); } + if (TargetDecl->hasAttr()) { + FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction); + } if (const auto *RA = TargetDecl->getAttr(); RA && RA->getDeallocator() == nullptr) RetAttrs.addAttribute(llvm::Attribute::NoAlias); diff --git a/clang/lib/CodeGen/CGKitsune.cpp b/clang/lib/CodeGen/CGKitsune.cpp index 9f99ee449e832..d026d74f0e7bb 100644 --- a/clang/lib/CodeGen/CGKitsune.cpp +++ b/clang/lib/CodeGen/CGKitsune.cpp @@ -59,6 +59,7 @@ #include "clang/CodeGen/CGFunctionInfo.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/IR/ValueMap.h" +#include "llvm/IR/FixedPointBuilder.h" using namespace clang; using namespace CodeGen; @@ -106,6 +107,8 @@ CodeGenFunction::GetTapirTargetAttr(ArrayRef Attrs) { return llvm::TTID::Cuda; case TapirTargetAttr::Hip: return llvm::TTID::Hip; + case TapirTargetAttr::GPU: + return llvm::TTID::GPU; case TapirTargetAttr::OpenCilk: return llvm::TTID::OpenCilk; case TapirTargetAttr::OpenMP: @@ -326,6 +329,12 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S, // Evaluate the initialization before the loop. 
EmitStmt(S.getInit()); + // TODO: explain more + // We assume that the boolean is a binary operator and pre-compute RHS + auto* BO = dyn_cast(S.getCond()); + llvm::Value *RHS = EmitScalarExpr(BO->getRHS()); + + // In a parallel loop there will always be a condition block // so there is no need to test JumpDest Condition = getJumpDestInCurrentScope("forall.cond"); @@ -364,7 +373,59 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S, // C99 6.8.5p2/p4: The first substatement is executed if the expression // compares unequal to 0. The condition must be a scalar type. - llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond()); + + //llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond()); + llvm::Value *LHS = EmitScalarExpr(BO->getLHS()); + QualType LHSTy = BO->getLHS()->getType(); + llvm::Value *BoolCondVal; + if (LHSTy->hasSignedIntegerRepresentation()) { + switch(BO->getOpcode()) { + case BO_GT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGT, LHS, RHS, "cmp"); + break; + case BO_GE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGE, LHS, RHS, "cmp"); + break; + case BO_LT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, LHS, RHS, "cmp"); + break; + case BO_LE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLE, LHS, RHS, "cmp"); + break; + case BO_NE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp"); + break; + case BO_EQ: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp"); + break; + default: + llvm_unreachable("Invalid comparison in forall"); + } + } else { + switch(BO->getOpcode()){ + case BO_GT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGT, LHS, RHS, "cmp"); + break; + case BO_GE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGE, LHS, RHS, "cmp"); + break; + case BO_LT: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_ULT, LHS, RHS, "cmp"); + break; + case BO_LE: + BoolCondVal = 
Builder.CreateICmp(llvm::ICmpInst::ICMP_ULE, LHS, RHS, "cmp"); + break; + case BO_NE: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp"); + break; + case BO_EQ: + BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp"); + break; + default: + llvm_unreachable("Invalid comparison in forall"); + } + } + Builder.CreateCondBr( BoolCondVal, Detach, Sync.getBlock(), createProfileWeightsForLoop(S.getCond(), getProfileCount(S.getBody()))); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 5381edbe61a3d..9897492f05cee 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -161,6 +161,9 @@ static void CheckTTEnabled(const Driver &D, llvm::TTID TT) { if (!KITSUNE_REALM_ENABLED) D.Diag(diag::err_drv_kitsune_target_not_enabled) << llvm::toString(TT); return; + case llvm::TTID::GPU: + // TODO: Check this + return; case llvm::TTID::Serial: // The serial tapir target is always enabled return; @@ -229,13 +232,17 @@ static void CheckKitsuneOptions(const Driver &D, const ArgList &Args, // If --kokkos is provided, then a tapir target must also be provided. if (!Args.hasArg(options::OPT_tapir_EQ)) { - D.Diag(diag::err_drv_kitsune_kokkos_no_tapir) + D.Diag(diag::err_drv_kitsune_tapir_required) << Args.getLastArg(options::OPT_kokkos, options::OPT_kokkos_no_init) ->getSpelling(); return; } } + if (const Arg *A = Args.getLastArg(options::OPT_print_before_first)) + if (!Args.hasArg(options::OPT_tapir_EQ)) + D.Diag(diag::err_drv_kitsune_tapir_required) << A->getSpelling(); + // Check that the -ftapir flag has a valid value. This stops us from // reporting multiple errors because the flag is examined in several places. 
if (const Arg *A = Args.getLastArg(options::OPT_tapir_EQ)) { diff --git a/clang/lib/Driver/KitsuneOptionUtils.cpp b/clang/lib/Driver/KitsuneOptionUtils.cpp index a288bdf7f7477..00a2de5e70298 100644 --- a/clang/lib/Driver/KitsuneOptionUtils.cpp +++ b/clang/lib/Driver/KitsuneOptionUtils.cpp @@ -112,6 +112,8 @@ clang::getTapirTargetConfigFileName(const opt::ArgList &args) { return "qthreads.cfg"; case TTID::Realm: return "realm.cfg"; + case TTID::GPU: + return "gpu.cfg"; } llvm_unreachable("getTapirTargetConfigFile: TTID not handled"); } @@ -323,6 +325,7 @@ static bool parseKitsuneTTArgs(KitsuneOptions &kitOpts, TTID tt, case llvm::TTID::Realm: return parseKitsuneRealmArgs(kitOpts, args, optTable, diags); case llvm::TTID::Serial: + case llvm::TTID::GPU: // TODO:? return true; } llvm_unreachable("ParseKitsuneTTArgs: TTID not handled"); @@ -335,6 +338,7 @@ bool clang::parseKitsuneArgs(KitsuneOptions &kitOpts, const char *argv0, kitOpts.setKitsuneFrontend(IsKitsuneFrontend(argv0)); kitOpts.setStripmineLoops(args.hasArg(OPT_fstripmine)); + kitOpts.setPrintBeforeFirst(args.hasArg(OPT_print_before_first)); kitOpts.setTapirVerbose(args.hasArg(OPT_tapir_verbose)); kitOpts.setKitrtVerbose(args.hasArg(OPT_kitrt_verbose)); diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 9f7a3a4a83e23..c2d263e58a3af 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -2277,6 +2277,8 @@ void ToolChain::AddKitsunePreprocessorArgs(const ArgList &Args, switch (TT) { case TTID::Nolo: return; + case TTID::GPU: + return; case TTID::Cuda: return ExtractArgsFromString(KITSUNE_CUDA_EXTRA_PREPROCESSOR_FLAGS, CmdArgs, Args); @@ -2356,6 +2358,8 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_COMPILER_FLAGS, CmdArgs, Args); return; + case llvm::TTID::GPU: + return; case TTID::OpenMP: AddKitsuneOpenMPCommonArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_COMPILER_FLAGS, 
CmdArgs, Args); @@ -2385,6 +2389,7 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args, if (std::optional TT = parseTapirTargetIfValid(Args)) { Args.AddLastArg(CmdArgs, options::OPT_ffp_contract); + Args.AddLastArg(CmdArgs, options::OPT_print_before_first); Args.AddLastArg(CmdArgs, options::OPT_kitrt_verbose); Args.AddLastArg(CmdArgs, options::OPT_tapir_verbose); Args.AddLastArg(CmdArgs, options::OPT_tapir_EQ); @@ -2563,6 +2568,8 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args, AddKitsuneOpenCilkLinkerArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_LINKER_FLAGS, CmdArgs, Args); return; + case TTID::GPU: + return; case TTID::OpenMP: AddKitsuneOpenMPLinkerArgs(Args, CmdArgs); ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_LINKER_FLAGS, CmdArgs, Args); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index c9309fbebae6f..9a47676b2fe95 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4813,6 +4813,7 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts, case llvm::TTID::Qthreads: case llvm::TTID::Realm: case llvm::TTID::Serial: + case llvm::TTID::GPU: return; } llvm_unreachable("GenerateKitsuneArgs: TTID not handled"); @@ -4849,6 +4850,9 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts, if (Opts.getStripmineLoops()) GenerateArg(Consumer, OPT_fstripmine); + + if (Opts.getPrintBeforeFirst()) + GenerateArg(Consumer, OPT_print_before_first); } bool CompilerInvocation::CheckKitsuneArgs(const ArgList &Args, diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 2ab0b218cf8c6..19f7b537003e6 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7902,10 +7902,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_TypeNullable: handleNullableTypeAttr(S, D, AL); break; - case 
ParsedAttr::AT_VTablePointerAuthentication: handleVTablePointerAuthentication(S, D, AL); break; + case ParsedAttr::AT_KitsuneReduction: + handleSimpleAttribute(S, D, AL); + break; } } diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 51a159697a0a1..bc93989d472ca 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -1053,6 +1053,9 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { return; } + if (kitsuneOpts.hasTTID() && kitsuneOpts.getPrintBeforeFirst()) + llvm::errs() << *llvmModule << "\n"; + // Run the passes. mpm.run(*llvmModule, mam); diff --git a/kitsune-tests/reductions/l2.c b/kitsune-tests/reductions/l2.c new file mode 100644 index 0000000000000..335c6c585ec28 --- /dev/null +++ b/kitsune-tests/reductions/l2.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(double *a, double b, double unit){ + *a += b; +} + +double l2(uint64_t n, double* a){ + double red = 0; + forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(double *a, double b, double unit){ + *a += b; +} + +double l2(uint64_t n, double* a){ + double red = 0; + forall(uint64_t i=0; i n/10) + sum(&red, a[i]*a[i], 0); + } + + return sqrt(red); +} + +int main(int argc, char** argv){ + int e = argc > 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include + +double l2(uint64_t n, double* a){ + double red = 0; + #pragma omp parallel for reduction(+:red) + for(uint64_t i=0; i 1 ? atoi(argv[1]) : 28; + int niter = argc > 2 ? 
atoi(argv[2]) : 100; + uint64_t n = 1ULL< +#include +#include +#include +#include +#include +#include +#include +#include + +reduction +void sum(int *a, int b, int unit){ + *a += b + unit; +} + +int triangle(uint64_t n, int* a){ + int red = 0; + forall(uint64_t i=0; i 1 ? atoi(argv[1]) : 100; + int niter = argc > 2 ? atoi(argv[2]) : 100; + uint64_t n = e;//1ULL< + +... +reduction void sum(double* l, double r, double unit){ + *l += r; +} + +double red=0.0; +forall(...){ + sum(&red, x, 0.0) ; +} +``` + + + diff --git a/kitsune/include/kitsune/Config/config.h.cmake b/kitsune/include/kitsune/Config/config.h.cmake index 63eef3ac05aea..4e93c06c9756d 100644 --- a/kitsune/include/kitsune/Config/config.h.cmake +++ b/kitsune/include/kitsune/Config/config.h.cmake @@ -274,4 +274,7 @@ constexpr unsigned KITSUNE_MAX_FIXED_THREADS_PER_BLOCK = 1024; // ----------------------------------------------------------------------------- +// Is the gpu target enabled +#cmakedefine01 KITSUNE_GPU_ENABLED + #endif // KITSUNE_CONFIG_H diff --git a/kitsune/include/kitsune/Core/Tapir.h b/kitsune/include/kitsune/Core/Tapir.h index d31e0b13c0c23..eb8da83bf17b9 100644 --- a/kitsune/include/kitsune/Core/Tapir.h +++ b/kitsune/include/kitsune/Core/Tapir.h @@ -55,7 +55,7 @@ enum class TTID : uint32_t { /// Lower to kitsune's JIT-enabled, GPU-agnostic runtime. /// FIXME: This has been disabled for now, but should be re-enabled shortly. - // GPUABI = 0x10, + GPU = 0x10, /// Lower to the qthreads runtime. /// FIXME: This is currently disabled and needs to be updated before it can be @@ -78,6 +78,7 @@ enum class TTID : uint32_t { /// FIXME: Almost certainly obsolete. OpenMP = 0x200, + }; /// Convert the integer to a \ref TTID. 
If the integer cannot be converted to a diff --git a/kitsune/include/kitsune/Core/TapirTargets.h b/kitsune/include/kitsune/Core/TapirTargets.h index 808fadf77b394..98b358d960775 100644 --- a/kitsune/include/kitsune/Core/TapirTargets.h +++ b/kitsune/include/kitsune/Core/TapirTargets.h @@ -41,8 +41,12 @@ #include "llvm/Transforms/Tapir/QthreadsABI.h" #endif // KITSUNE_QTHREADS_ENABLED -#if KITSNUE_REALM_ENABLED +#if KITSUNE_REALM_ENABLED #include "llvm/Transforms/Tapir/RealmABI.h" #endif // KITSUNE_REALM_ENABLED +#if KITSUNE_GPU_ENABLED +#include "llvm/Transforms/Tapir/GPUABI.h" +#endif // KITSUNE_REALM_ENABLED + // #endif // LLVM_TAPIR_TARGETS_H diff --git a/kitsune/include/kitsune/Frontend/KitsuneOptions.h b/kitsune/include/kitsune/Frontend/KitsuneOptions.h index a5eafaa4d9090..81c877cade54e 100644 --- a/kitsune/include/kitsune/Frontend/KitsuneOptions.h +++ b/kitsune/include/kitsune/Frontend/KitsuneOptions.h @@ -90,6 +90,14 @@ class KitsuneOptions { /// Should loop strip-mining be enabled. unsigned stripmineLoops : 1; + /// Should the LLVM Module be printed to stderr just before running the + /// optimization passes. Tapir currently requires at least -O1. This makes it + /// difficult to write tests that check that the frontend generates the + /// expected LLVM IR. If this restriction is ever removed, this option will no + /// longer be necessary since we could just pass -O0 to examine the IR + /// generated by the frontend. + unsigned printBeforeFirst : 1; + /// Enable verbose mode for the tapir target LLVM passes. This is different /// from -mllvm -debug-only=. 
When the verbose /// flag is set, the passes may print some subset of the information that is @@ -219,6 +227,10 @@ class KitsuneOptions { void setStripmineLoops(bool stripmineLoops = true) { this->stripmineLoops = stripmineLoops; } + + void setPrintBeforeFirst(bool printBeforeFirst = true) { + this->printBeforeFirst = printBeforeFirst; + } /// @} /// @{ @@ -292,6 +304,8 @@ class KitsuneOptions { bool getStripmineLoops() const { return stripmineLoops; } + bool getPrintBeforeFirst() const { return printBeforeFirst; } + bool getTapirVerbose() const { return tapirVerbose; } bool getKitrtVerbose() const { return kitrtVerbose; } diff --git a/kitsune/include/kitsune/kitsune.h b/kitsune/include/kitsune/kitsune.h index 5f24569bf8ac3..57003b9800ff7 100644 --- a/kitsune/include/kitsune/kitsune.h +++ b/kitsune/include/kitsune/kitsune.h @@ -8,6 +8,16 @@ #ifndef __KITSUNE_KITSUNE_H__ #define __KITSUNE_KITSUNE_H__ +#include +#include + +#if defined(reduction) +#warning found reduction definition: try puttin kitsune lower in include order +#else +#define reduction __attribute__((optnone, noinline, kitsune_reduction)) +#endif + + #if defined(spawn) // FIXME KITSUNE: Should this be an error instead of a warning? #warning encountered multiple definitions of spawn! 
diff --git a/kitsune/lib/Analysis/TapirTargetAnalysis.cpp b/kitsune/lib/Analysis/TapirTargetAnalysis.cpp index 51dc77aa323c9..e0c68d8b2a51c 100644 --- a/kitsune/lib/Analysis/TapirTargetAnalysis.cpp +++ b/kitsune/lib/Analysis/TapirTargetAnalysis.cpp @@ -87,6 +87,11 @@ createTT(TTID id, Module &m, const TapirTargetOptions &tto) { case TTID::Realm: return std::make_unique(m); #endif // KITSUNE_REALM_ENABLED + +#if KITSUNE_GPU_ENABLED + case TTID::GPU: + return std::make_unique(m, tto); +#endif // KITSUNE_REALM_ENABLED default: llvm_unreachable("createTT: TTID not handled"); diff --git a/kitsune/lib/Core/TapirTargetOptions.cpp b/kitsune/lib/Core/TapirTargetOptions.cpp index 4f6f5e6a10ea1..fec422593eba9 100644 --- a/kitsune/lib/Core/TapirTargetOptions.cpp +++ b/kitsune/lib/Core/TapirTargetOptions.cpp @@ -47,7 +47,7 @@ static cl::opt clEnumValN(TTID::Cuda, "cuda", ""), clEnumValN(TTID::Hip, "hip", ""), clEnumValN(TTID::OpenCilk, "opencilk", ""), - // clEnumValN(TTID::GPUABI, "gpuabi", ""), + clEnumValN(TTID::GPU, "gpu", ""), clEnumValN(TTID::Qthreads, "qthreads", ""), clEnumValN(TTID::Realm, "realm", ""), clEnumValN(TTID::Lambda, "lambda", ""), diff --git a/kitsune/lib/Support/Deserialize.cpp b/kitsune/lib/Support/Deserialize.cpp index 547d9a5806586..629afd5012d07 100644 --- a/kitsune/lib/Support/Deserialize.cpp +++ b/kitsune/lib/Support/Deserialize.cpp @@ -22,6 +22,7 @@ std::optional llvm::createTTIDFrom(StringRef s) { .Case("nolo", TTID::Nolo) .Case("cuda", TTID::Cuda) .Case("hip", TTID::Hip) + .Case("gpu", TTID::GPU) .Case("lambda", TTID::Lambda) .Case("omptask", TTID::OMPTask) .Case("opencilk", TTID::OpenCilk) @@ -45,8 +46,7 @@ std::optional llvm::createTTIDFrom(uint32_t u) { case 0x8: return TTID::OpenCilk; case 0x10: - llvm_unreachable("createTTIDFrom: GPUABI has not been enabled"); - // return TTID::GPUABI; + return TTID::GPU; case 0x20: return TTID::Qthreads; case 0x40: diff --git a/kitsune/lib/Support/ToString.cpp b/kitsune/lib/Support/ToString.cpp index 
41b1163534580..04f66f6d8b5bc 100644 --- a/kitsune/lib/Support/ToString.cpp +++ b/kitsune/lib/Support/ToString.cpp @@ -37,6 +37,8 @@ std::string llvm::toString(const TTID &tt) { return "qthreads"; case TTID::Realm: return "realm"; + case TTID::GPU: + return "gpu"; } llvm_unreachable("toString: TTID not handled"); } diff --git a/kitsune/runtime/cuda/dylib_support.cpp b/kitsune/runtime/cuda/dylib_support.cpp index c518e72e712a9..907d4f4ca3808 100644 --- a/kitsune/runtime/cuda/dylib_support.cpp +++ b/kitsune/runtime/cuda/dylib_support.cpp @@ -114,7 +114,7 @@ bool __kitcuda_load_symbols() { DLSYM_LOAD(cuFuncGetAttribute); /* Context management */ - DLSYM_LOAD(cuCtxCreate_v3); + DLSYM_LOAD(cuCtxCreate_v4); DLSYM_LOAD(cuDevicePrimaryCtxRetain); DLSYM_LOAD(cuCtxGetCurrent); DLSYM_LOAD(cuCtxSetCurrent); diff --git a/kitsune/runtime/cuda/kitcuda.cpp b/kitsune/runtime/cuda/kitcuda.cpp index e26aafe2f5b24..76fc1c586228c 100644 --- a/kitsune/runtime/cuda/kitcuda.cpp +++ b/kitsune/runtime/cuda/kitcuda.cpp @@ -72,6 +72,7 @@ bool _kitcuda_initialized = false; int _kitcuda_device_id = -1; CUdevice _kitcuda_device = -1; +CUmemLocation _kitcuda_mem_location; CUcontext _kitcuda_context; // TODO: We currently don't use these values within the runtime but @@ -141,6 +142,9 @@ bool __kitcuda_initialize() { if (!__kitrt_get_env_value("KITCUDA_DEVICE_ID", _kitcuda_device_id)) _kitcuda_device_id = 0; + _kitcuda_mem_location.type = CU_MEM_LOCATION_TYPE_DEVICE; + _kitcuda_mem_location.id = _kitcuda_device_id; + assert(_kitcuda_device_id < device_count && "kitcuda: KITCUDA_DEVICE_ID value exceeds available number" " of devices."); diff --git a/kitsune/runtime/cuda/kitcuda.h b/kitsune/runtime/cuda/kitcuda.h index 20de2485db0c9..7ae93d4115d03 100644 --- a/kitsune/runtime/cuda/kitcuda.h +++ b/kitsune/runtime/cuda/kitcuda.h @@ -441,6 +441,7 @@ inline CUcontext __kitcuda_get_context() { #endif extern CUdevice _kitcuda_device; +extern CUmemLocation _kitcuda_mem_location; extern CUcontext 
_kitcuda_context; #define CU_SAFE_CALL(x) \ diff --git a/kitsune/runtime/cuda/kitcuda_dylib.h b/kitsune/runtime/cuda/kitcuda_dylib.h index 8e89edc835549..3ed7932e84bde 100644 --- a/kitsune/runtime/cuda/kitcuda_dylib.h +++ b/kitsune/runtime/cuda/kitcuda_dylib.h @@ -82,7 +82,7 @@ DECLARE_DLSYM(cuFuncGetName); DECLARE_DLSYM(cuFuncGetAttribute); /* Context management */ -DECLARE_DLSYM(cuCtxCreate_v3); +DECLARE_DLSYM(cuCtxCreate_v4); DECLARE_DLSYM(cuDevicePrimaryCtxRetain); DECLARE_DLSYM(cuCtxGetCurrent); DECLARE_DLSYM(cuCtxSetCurrent); diff --git a/kitsune/runtime/cuda/memory.cpp b/kitsune/runtime/cuda/memory.cpp index de1c1d24bee3d..5b8d87f490822 100644 --- a/kitsune/runtime/cuda/memory.cpp +++ b/kitsune/runtime/cuda/memory.cpp @@ -79,9 +79,9 @@ __kitcuda_mem_alloc_managed(size_t size) { // to occur on first touch -- thus our 'prefetch' status here is a bit // misleading (technically we are not prefetched to either host nor device). CU_SAFE_CALL(cuMemAdvise_p(devp, size, CU_MEM_ADVISE_SET_ACCESSED_BY, - _kitcuda_device)); + _kitcuda_mem_location)); CU_SAFE_CALL(cuMemAdvise_p(devp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - _kitcuda_device)); + _kitcuda_mem_location)); int enable = 1; CU_SAFE_CALL( @@ -267,7 +267,7 @@ void *__kitcuda_mem_gpu_prefetch(void *vp, void *opaque_stream) { // CU_MEM_ADVISE_SET_READ_MOSTLY advice flag. CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - _kitcuda_device)); + _kitcuda_mem_location)); // Issue a prefetch request on the provided stream. If the given // stream is null, create a new stream and return it. 
Once issued @@ -280,7 +280,7 @@ void *__kitcuda_mem_gpu_prefetch(void *vp, void *opaque_stream) { else cu_stream = (CUstream)__kitcuda_get_thread_stream(); - CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitcuda_device, + CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, _kitcuda_mem_location, 0, cu_stream)); _kitcuda_mem_alloc_mutex.lock(); __kitrt_mark_mem_prefetched(vp); @@ -322,9 +322,12 @@ void *__kitcuda_mem_host_prefetch(void *vp, void *opaque_stream) { // // TODO: A lot of work needs to go into seeing if we can be // smarter about device- and host-side prefetching. + CUmemLocation cpu; + cpu.type = CU_MEM_LOCATION_TYPE_HOST; + cpu.id = 0; CU_SAFE_CALL(cuMemAdvise_p((CUdeviceptr)vp, size, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, - CU_DEVICE_CPU)); + cpu)); // Issue a prefetch request on the stream associated with the // calling thread. Once issued go ahead and mark the memory as // no long being prefetched to the device/GPU. This "mark" does @@ -336,7 +339,7 @@ void *__kitcuda_mem_host_prefetch(void *vp, void *opaque_stream) { else cu_stream = (CUstream)__kitcuda_get_thread_stream(); - CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, CU_DEVICE_CPU, + CU_SAFE_CALL(cuMemPrefetchAsync_p((CUdeviceptr)vp, size, cpu, 0, cu_stream)); __kitrt_set_mem_prefetch(vp, false); return cu_stream; diff --git a/kitsune/test/driver/print-before-first.c b/kitsune/test/driver/print-before-first.c new file mode 100644 index 0000000000000..2e6fd1db37d3a --- /dev/null +++ b/kitsune/test/driver/print-before-first.c @@ -0,0 +1,40 @@ +// Check that the --print-before-first option is handled correctly. 
+// +// ---------------------------------------------------------------------------- +// The --print-before-first option must be used with a Kitsune frontend +// +// RUN: not %clang -O1 --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix FRONTEND +// +// FRONTEND: option '--print-before-first' must be used with a Kitsune frontend +// +// ---------------------------------------------------------------------------- +// The --print-before-first option requires the --tapir option +// +// RUN: not %kitcc -O1 --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix TT -allow-empty +// +// TT: --tapir is required with '--print-before-first' +// +// ---------------------------------------------------------------------------- +// If the --print-before-first option has been implemented correctly, mem2reg +// will not have run, so a stack slot will have been created for the function +// argument. +// +// RUN: %kitcc -O1 --tapir=serial --print-before-first %s \ +// RUN: -S -emit-llvm -o /dev/null 2>&1 \ +// RUN: | FileCheck %s -check-prefix EARLY +// +// EARLY: define {{.+}}ptr @f(ptr {{.*}}%[[P:[^)]+]]) +// EARLY: %[[SLOT:.+]] = alloca ptr +// EARLY: store ptr %[[P]], ptr %[[SLOT]] +// EARLY: %[[RV:.+]] = load ptr, ptr %[[SLOT]] +// EARLY: ret ptr %[[RV]] +// +// ---------------------------------------------------------------------------- + +void* f(void* p) { + return p; +} diff --git a/kitsune/test/driver/print-before-first.f90 b/kitsune/test/driver/print-before-first.f90 new file mode 100644 index 0000000000000..1af4ae07173c3 --- /dev/null +++ b/kitsune/test/driver/print-before-first.f90 @@ -0,0 +1,39 @@ +! REQUIRES: kitfc +! +! Check that the --print-before-first option is handled correctly. +! +! ---------------------------------------------------------------------------- +! The --print-before-first option must be used with a Kitsune frontend +! +! 
RUN: not %flang -O1 --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix FRONTEND +! +! FRONTEND: option '--print-before-first' must be used with a Kitsune frontend +! +! ---------------------------------------------------------------------------- +! The --print-before-first option requires the --tapir option +! +! RUN: not %kitfc -O1 --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix TT -allow-empty +! +! TT: --tapir is required with '--print-before-first' +! +! ---------------------------------------------------------------------------- +! If the --print-before-first option has been implemented correctly, @_QQmain +! be called from @main. Since @_QQmain is empty, it will be absent after +! optimizations are run (because the empty body will have been inlined) +! +! RUN: %kitfc -O1 --tapir=serial --print-before-first %s \ +! RUN: -S -emit-llvm -o /dev/null 2>&1 \ +! RUN: | FileCheck %s -check-prefix EARLY +! +! EARLY: define {{.*}}i32 @main +! EARLY: call {{.+}} @_FortranAProgramStart +! EARLY: call {{.+}} @_QQmain +! EARLY: call {{.+}} @_FortranAProgramEndStatement +! +! 
---------------------------------------------------------------------------- + +end program diff --git a/kitsune/test/tapir/cuda/kernel-opt.ll b/kitsune/test/tapir/cuda/kernel-opt.ll index 682292a342bed..189a5ad7f93ef 100644 --- a/kitsune/test/tapir/cuda/kernel-opt.ll +++ b/kitsune/test/tapir/cuda/kernel-opt.ll @@ -32,7 +32,13 @@ ; RUN: | FileCheck %s --check-prefix=O2 ; ; O2-NOT: = phi i64 -; O2: define {{.+}} @__kitcu_{{.+}}(i64 {{.*}}%[[UB:[^,]+]], i64 {{[^,]+}}, i64 {{[^,]+}}, ptr {{.*}}%[[BUF:[^,]+]], i64 {{.*}}%[[N:[^)]+]]) {{.*}}#[[ATTRS:[0-9]+]] +; O2: define {{.+}} @__kitcu_{{[^(]+}}( +; O2-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[GRAINSIZE:[^,]+]], +; O2-SAME: ptr {{[^%]*}}%[[BUF:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[N:[^)]+]]) +; O2-SAME: {{.*}}#[[ATTRS:[0-9]+]] ; O2-NEXT: [[BBENTRY:.+]]: ; O2-NEXT: %[[TID:.+]] = tail call {{(range.+ )?}}i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; O2-NEXT: %[[BIDX:.+]] = tail call {{(range.+ )?}}i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() diff --git a/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll b/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll index 90132664556fd..ed8d90f8cb134 100644 --- a/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll +++ b/kitsune/test/tapir/cuda/kernel-threadiv-1d.ll @@ -6,7 +6,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{[^(]+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[TID:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK: %[[BIDX:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; CHECK: %[[BDIM:.+]] = {{.*}}call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() diff --git a/kitsune/test/tapir/cuda/launch.ll b/kitsune/test/tapir/cuda/launch.ll index 4c81e9c68f494..f3a2a8f608c82 100644 --- 
a/kitsune/test/tapir/cuda/launch.ll +++ b/kitsune/test/tapir/cuda/launch.ll @@ -28,8 +28,8 @@ ; These are followed by a variable number of arguments that are to be passed to ; the kernel being launched. These are typically in the order ; -; - trip count ; - start index +; - trip count ; - grain size ; - ... ; @@ -44,8 +44,8 @@ ; CHECK-SAME: i32 0, ; CHECK-SAME: ptr {{.*}}@[[G_KERNEL_PROPS]], ; CHECK-SAME: ptr %[[STREAM]], -; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 0, +; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 1, ; CHECK-SAME: ptr %c, ; CHECK-SAME: i64 %n diff --git a/kitsune/test/tapir/hip/kernel-opt.ll b/kitsune/test/tapir/hip/kernel-opt.ll index 4f58eec184fd2..a16de3d57a85d 100644 --- a/kitsune/test/tapir/hip/kernel-opt.ll +++ b/kitsune/test/tapir/hip/kernel-opt.ll @@ -32,7 +32,13 @@ ; RUN: | FileCheck %s --check-prefix=O2 ; ; O2-NOT: = phi i64 -; O2: define {{.+}} @__kithip_{{.+}}(i64 {{.*}}%[[UB:[^,]+]], i64 {{[^,]+}}, i64 {{[^,]+}}, ptr {{.*}}%[[BUF:[^,]+]], i64 {{.*}}%[[N:[^)]+]]) {{.*}}#[[ATTRS:[0-9]+]] +; O2: define {{.+}} @__kithip_{{[^(]+}}( +; O2-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[GRAINSIZE:[^,]+]], +; O2-SAME: ptr {{[^%]*}}%[[BUF:[^,]+]], +; O2-SAME: i64 {{[^%]*}}%[[N:[^)]+]]) +; O2-SAME: {{.*}}#[[ATTRS:[0-9]+]] ; O2-NEXT: [[BBENTRY:.+]]: ; O2-NEXT: %[[BUFCST:.+]] = addrspacecast ptr %[[BUF]] to ptr addrspace(1) ; O2-NEXT: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.x() diff --git a/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll b/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll index 8e93d1d33071b..008d681218a27 100644 --- a/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll +++ b/kitsune/test/tapir/hip/kernel-threadiv-1d-y.ll @@ -7,7 +7,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{.+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; 
CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.y() ; CHECK: %[[TID:.+]] = zext i32 %[[WITEM]] to i64 ; CHECK: %[[BDIM:.+]] = {{.*}}call i64 @__ockl_get_local_size(i32 1) diff --git a/kitsune/test/tapir/hip/kernel-threadiv-1d.ll b/kitsune/test/tapir/hip/kernel-threadiv-1d.ll index b2a91a1d97fc0..7247002ef907b 100644 --- a/kitsune/test/tapir/hip/kernel-threadiv-1d.ll +++ b/kitsune/test/tapir/hip/kernel-threadiv-1d.ll @@ -6,7 +6,11 @@ ; RUN: | %kit-mbc -S \ ; RUN: | FileCheck %s ; -; CHECK: define {{.+}}(i64 {{[^%]*}}%[[UB:[^,]+]], {{.+}}) #[[ATTRS:[0-9]+]] +; CHECK: define {{.+}}( +; CHECK-SAME: i64 {{[^%]*}}%[[LB:[^,]+]], +; CHECK-SAME: i64 {{[^%]*}}%[[UB:[^,]+]], +; CHECK-SAME: i64 {{[^)]+}}) +; CHECK-SAME: #[[ATTRS:[0-9]+]] ; CHECK: %[[WITEM:.+]] = {{.*}}call i32 @llvm.amdgcn.workitem.id.x() ; CHECK: %[[TID:.+]] = zext i32 %[[WITEM]] to i64 ; CHECK: %[[BDIM:.+]] = {{.*}}call i64 @__ockl_get_local_size(i32 0) diff --git a/kitsune/test/tapir/hip/launch.ll b/kitsune/test/tapir/hip/launch.ll index 72b8dec1de102..183806f2d18d1 100644 --- a/kitsune/test/tapir/hip/launch.ll +++ b/kitsune/test/tapir/hip/launch.ll @@ -29,8 +29,8 @@ ; These are followed by a variable number of arguments that are to be passed to ; the kernel being launched. These are typically in the order ; -; - trip count ; - start index +; - trip count ; - grain size ; - ... ; @@ -45,8 +45,8 @@ ; CHECK-SAME: i32 0, ; CHECK-SAME: ptr {{.*}}@[[G_KERNEL_PROPS]], ; CHECK-SAME: ptr %[[STREAM]], -; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 0, +; CHECK-SAME: i64 %n, ; CHECK-SAME: i64 1, ; CHECK-SAME: ptr %c, ; CHECK-SAME: i64 %n diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 9192fca94451d..658ff27f06a8e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -444,6 +444,7 @@ list(JOIN KITSUNE_ENABLED_LANGS_STR " " KITSUNE_ENABLED_LANGS_STR) # affects clang. And we have ensured that kitsune is configured before clang. 
set(KITSUNE_CUDA_ENABLED OFF CACHE INTERNAL "Enable 'cuda' tapir target" FORCE) set(KITSUNE_HIP_ENABLED OFF CACHE INTERNAL "Enable 'hip' tapir target" FORCE) +set(KITSUNE_GPU_ENABLED OFF CACHE INTERNAL "Enable 'gpu' tapir target" FORCE) set(KITSUNE_LAMBDA_ENABLED OFF CACHE INTERNAL "Enable 'lambda' tapir target" FORCE) set(KITSUNE_OMPTASK_ENABLED OFF CACHE INTERNAL "Enable 'omptask' tapir target" FORCE) set(KITSUNE_OPENCILK_ENABLED OFF CACHE INTERNAL "Enable 'opencilk' tapir target" FORCE) @@ -453,7 +454,7 @@ set(KITSUNE_REALM_ENABLED OFF CACHE INTERNAL "Enable 'realm' tapir target" FORCE set(KITSUNE_SERIAL_ENABLED ON CACHE INTERNAL "Enable 'serial' tapir target" FORCE) set(KITSUNE_KNOWN_TAPIR_TARGETS - "cuda;hip;lambda;omptask;opencilk;openmp;qthreads;realm;serial" + "cuda;hip;gpu;lambda;omptask;opencilk;openmp;qthreads;realm;serial" CACHE INTERNAL "All known Tapir targets" FORCE) @@ -493,6 +494,6 @@ foreach(target IN LISTS KITSUNE_ENABLED_TAPIR_TARGETS) message(FATAL_ERROR "Unknown Tapir target '${target}'") endif () endforeach () list(SORT KITSUNE_ENABLED_TAPIR_TARGETS_STR) list(JOIN KITSUNE_ENABLED_TAPIR_TARGETS_STR " " KITSUNE_ENABLED_TAPIR_TARGETS_STR) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 14c6e28d55369..a45afe2dfe615 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -813,6 +813,7 @@ enum AttributeKindCodes { ATTR_KIND_KIT_KERNEL_PROPS = 109, ATTR_KIND_KIT_KERNEL = 110, ATTR_KIND_KIT_DEVICE = 111, + ATTR_KIND_KIT_REDUCTION = 112, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 165573f719d59..b88603738b846 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -276,6 +276,9 @@ def ReadNone : EnumAttr<"readnone", IntersectAnd,
[ParamAttr]>; /// Function only reads from memory. def ReadOnly : EnumAttr<"readonly", IntersectAnd, [ParamAttr]>; +/// Tapir reducer-related attributes. +def KitsuneReduction : EnumAttr<"kitsune_reduction", IntersectPreserve, [FnAttr]>; + /// Return value is always equal to this argument. def Returned : EnumAttr<"returned", IntersectAnd, [ParamAttr]>; diff --git a/llvm/include/llvm/Transforms/Tapir/CudaABI.h b/llvm/include/llvm/Transforms/Tapir/CudaABI.h index 15f9efd74c287..5d9edd5dd03e6 100644 --- a/llvm/include/llvm/Transforms/Tapir/CudaABI.h +++ b/llvm/include/llvm/Transforms/Tapir/CudaABI.h @@ -144,8 +144,6 @@ class CudaLoop : public LoopOutlineProcessor { // Cuda/PTX grid dimensions access. Function *CUGridDimX = nullptr, *CUGridDimY = nullptr, *CUGridDimZ = nullptr; - SmallVector OrderedInputs; - /// The GlobalValue's used in the loop that is being outlined. This includes /// functions, global variables, aliases and ifunc's. std::set UsedGlobalValues; @@ -161,25 +159,11 @@ class CudaLoop : public LoopOutlineProcessor { const TapirTargetOptions &TTOpts); ~CudaLoop(); - void setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) override final; - - unsigned getIVArgIndex(const Function &F, - const ValueSet &Args) const override final; - - unsigned getLimitArgIndex(const Function &F, - const ValueSet &Args) const override final; - void preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) override; void postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) override final; void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override final; - void remapData(ValueToValueMapTy &VMap) override final; }; } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir/GPUABI.h b/llvm/include/llvm/Transforms/Tapir/GPUABI.h index 
7e0ccda6af380..0ad6808a5420f 100644 --- a/llvm/include/llvm/Transforms/Tapir/GPUABI.h +++ b/llvm/include/llvm/Transforms/Tapir/GPUABI.h @@ -26,13 +26,13 @@ class LLVMLoop; class GPUABI : public TapirTarget { LLVMLoop *LOP = nullptr; public: - GPUABI(Module &M) : TapirTarget(M) {} + GPUABI(Module &M, const TapirTargetOptions &TTO) : TapirTarget(M, TTO) {}; ~GPUABI() {} Value *lowerGrainsizeCall(CallInst *GrainsizeCall) override final; void lowerSync(SyncInst &SI) override final; void addHelperAttributes(Function &F) override final {} - void preProcessFunction(Function &F, TaskInfo &TI, + bool preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) override final; void postProcessFunction(Function &F, bool OutliningTapirLoops) override final; @@ -49,8 +49,7 @@ class GPUABI : public TapirTarget { void processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) override final; - LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL, - OptimizationLevel OptLevel = OptimizationLevel=O2) + LoopOutlineProcessor *getLoopOutlineProcessor(const TapirLoopInfo *TL) override final; }; diff --git a/llvm/include/llvm/Transforms/Tapir/HipABI.h b/llvm/include/llvm/Transforms/Tapir/HipABI.h index 77fbc393dab01..dc818073cf4b4 100644 --- a/llvm/include/llvm/Transforms/Tapir/HipABI.h +++ b/llvm/include/llvm/Transforms/Tapir/HipABI.h @@ -159,31 +159,6 @@ class HipLoop : public LoopOutlineProcessor { const TapirTargetOptions &TTO); ~HipLoop(); - /// Prepares the set HelperArgs of function arguments for the outlined helper - /// function Helper for a Tapir loop. Also prepares the list HelperInputs of - /// input values passed to a call to Helper. HelperArgs and HelperInputs are - /// derived from the loop-control arguments LCArgs and loop-control inputs - /// LCInputs for the Tapir loop, as well the set TLInputsFixed of arguments to - /// the task underlying the Tapir loop. 
- void setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) override; - - /// Returns an integer identifying the index of the helper-function argument - /// in Args that specifies the starting iteration number. This return value - /// must complement the behavior of setupLoopOutlineArgs(). - unsigned getIVArgIndex(const Function &F, - const ValueSet &Args) const override; - - /// Returns an integer identifying the index of the helper-function argument - /// in Args that specifies the ending iteration number. This return value - /// must complement the behavior of setupLoopOutlineArgs(). - unsigned getLimitArgIndex(const Function &F, - const ValueSet &Args) const override; - /// Process the TapirLoop before it is outlined -- just prior to the /// outlining occurs. This allows the VMap and related details to be /// customized prior to outlining related operations (e.g. cloning of @@ -199,8 +174,6 @@ class HipLoop : public LoopOutlineProcessor { void processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) override; - void remapData(ValueToValueMapTy &VMap) override final; - private: Value *emitWorkItemId(IRBuilder<> &Builder, int ItemIndex); Value *emitWorkGroupId(IRBuilder<> &Builder, int ItemIndex); @@ -220,8 +193,6 @@ class HipLoop : public LoopOutlineProcessor { FunctionCallee HipWorkGroupIdXFn, HipWorkGroupIdYFn, HipWorkGroupIdZFn; FunctionCallee HipBlockDimFn; - SmallVector OrderedInputs; - /// The GlobalValue's used in the loop that is being outlined. This includes /// functions, global variables, aliases and ifunc's. 
std::set UsedGlobalValues; diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h index 270b77794620c..4940f3d7d463a 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMine.h @@ -47,7 +47,8 @@ Loop *StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, AssumptionCache *AC, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, bool ParallelEpilog, bool NeedNestedSync, - Loop **Remainderloop = nullptr); + Loop **Remainderloop = nullptr, + bool GPU = false); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h index 5b130c3e89d62..f148f20ff1820 100644 --- a/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h +++ b/llvm/include/llvm/Transforms/Tapir/LoopStripMinePass.h @@ -24,7 +24,7 @@ class LoopStripMinePass : public PassInfoMixin { public: explicit LoopStripMinePass() {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + PreservedAnalyses run(Module &F, ModuleAnalysisManager &AM); }; } // end namespace llvm diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 35c4a8bd78229..945f422ef6a0e 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2260,6 +2260,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::KitKernel; case bitc::ATTR_KIND_KIT_DEVICE: return Attribute::KitDevice; + case bitc::ATTR_KIND_KIT_REDUCTION: + return Attribute::KitsuneReduction; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 05a103aae5ddd..c9451018b070b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -960,6 +960,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { 
return bitc::ATTR_KIND_KIT_KERNEL; case Attribute::KitDevice: return bitc::ATTR_KIND_KIT_DEVICE; + case Attribute::KitsuneReduction: + return bitc::ATTR_KIND_KIT_REDUCTION; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 57c38dcd5aa1e..c21e98f38e980 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -14,6 +14,7 @@ /// //===----------------------------------------------------------------------===// +#include "kitsune/Analysis/TapirTargetAnalysis.h" #include "kitsune/Passes/PipelineUtils.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -1597,34 +1598,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // rather than on each loop in an inside-out manner, and so they are actually // function passes. - // Stripmine Tapir loops, if pass is enabled. - if (PTO.LoopStripmine && Level.getSpeedupLevel() > 1 && - !Level.isOptimizingForSize()) { - LoopPassManager LPM1, LPM2; - LPM1.addPass( - IndVarSimplifyPass(/*WidenIndVars=*/true, /*TapirLoopsOnly=*/true)); - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(std::move(LPM1), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); - OptimizePM.addPass(LoopStripMinePass()); - // Cleanup tasks after stripmining loops. - OptimizePM.addPass(TaskSimplifyPass()); - // Cleanup after stripmining loops. - LPM2.addPass(LoopSimplifyCFGPass()); - LPM2.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, - /*AllowSpeculation=*/true)); - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(std::move(LPM2), - /*UseMemorySSA=*/true, - /*UseBlockFrequencyInfo=*/true)); - // Don't run IndVarSimplify at this point, as it can actually inhibit - // vectorization in some cases. 
- OptimizePM.addPass(JumpThreadingPass()); - OptimizePM.addPass(CorrelatedValuePropagationPass()); - OptimizePM.addPass(InstCombinePass()); - } - invokeVectorizerStartEPCallbacks(OptimizePM, Level); LoopPassManager LPM; @@ -1760,8 +1733,41 @@ PassBuilder::buildTapirLoopLoweringPipeline(OptimizationLevel Level, FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); - MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + // Stripmine Tapir loops, if pass is enabled. + // This was moved from buildModuleOptimizationPipeline, I (George) expect it to be + // beneficial for all parallel backends. Needs performance evaluation to confirm. + if (PTO.LoopStripmine) { + // We need to do tapir target analysis to determine how to stripmine + MPM.addPass(RequireAnalysisPass()); + //MPM.addPass( + //createModuleToFunctionPassAdaptor(InvalidateAnalysisPass())); + + LoopPassManager LPM1, LPM2; + LPM1.addPass(IndVarSimplifyPass(true, true)); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM1), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + MPM.addPass(LoopStripMinePass()); + // Cleanup tasks after stripmining loops. + FPM.addPass(TaskSimplifyPass()); + // Cleanup after stripmining loops. + LPM2.addPass(LoopSimplifyCFGPass()); + LPM2.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); + FPM.addPass( + createFunctionToLoopPassAdaptor(std::move(LPM2), + /*UseMemorySSA=*/true, + /*UseBlockFrequencyInfo=*/true)); + // Don't run IndVarSimplify at this point, as it can actually inhibit + // vectorization in some cases. + FPM.addPass(JumpThreadingPass()); + FPM.addPass(CorrelatedValuePropagationPass()); + FPM.addPass(InstCombinePass()); + } + + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); // Outline Tapir loops as needed. 
MPM.addPass(LoopSpawningPass(Level)); if (VerifyTapirLowering) diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 8e62e9a2b97f6..063eccc8c83a8 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -192,6 +192,7 @@ MODULE_PASS("kit-kernel-properties", RecomputeKernelPropertiesPass()) MODULE_PASS("kit-prefetch", PrefetchingPass()) MODULE_PASS("kit-lower-intrinsics", LowerKitsuneIntrinsicsPass()) MODULE_PASS("kit-strip-addr-spaces", StripKitsuneAddrSpacesPass()) +MODULE_PASS("loop-stripmine", LoopStripMinePass()) #undef MODULE_PASS @@ -510,7 +511,6 @@ FUNCTION_PASS("loop-fusion", LoopFusePass()) FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass()) FUNCTION_PASS("loop-simplify", LoopSimplifyPass()) FUNCTION_PASS("loop-sink", LoopSinkPass()) -FUNCTION_PASS("loop-stripmine", LoopStripMinePass()) FUNCTION_PASS("loop-versioning", LoopVersioningPass()) FUNCTION_PASS("lower-atomic", LowerAtomicPass()) FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass()) diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d32b829e2ad79..79e530c6362b1 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -284,6 +284,10 @@ bool DeadArgumentEliminationPass::removeDeadArgumentsFromCallers(Function &F) { if (F.hasFnAttribute(Attribute::Naked)) return false; + // Don't operate on optnone (we need this for kitsune reductions) + if (F.hasFnAttribute(Attribute::OptimizeNone)) + return false; + if (F.use_empty()) return false; @@ -730,6 +734,10 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { if (FrozenFunctions.count(F)) return false; + // Don't operate on optnone (we need this for kitsune reductions) + if (F->hasFnAttribute(Attribute::OptimizeNone)) + return false; + // Start by computing a new prototype for the function, which is 
the same as // the old function, but has fewer arguments and a different return type. FunctionType *FTy = F->getFunctionType(); diff --git a/llvm/lib/Transforms/Tapir/CudaABI.cpp b/llvm/lib/Transforms/Tapir/CudaABI.cpp index 6d72fa29a937f..ec731bcc7a9d3 100644 --- a/llvm/lib/Transforms/Tapir/CudaABI.cpp +++ b/llvm/lib/Transforms/Tapir/CudaABI.cpp @@ -198,78 +198,6 @@ CudaLoop::~CudaLoop() { << KernelName << "'.\n"); } -void CudaLoop::setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) { - LLVM_DEBUG(dbgs() << "debug[cuabi]: setting up loop outline arguments...\n"); - - // Add the loop control inputs -- the first parameter defines the extent of - // the index space (the number of threads to launch). - { - Argument *EndArg = cast(LCArgs[1]); - EndArg->setName("runSize"); - HelperArgs.insert(EndArg); - - Value *InputVal = LCInputs[1]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The second parameter defines the start of the index space. - { - Argument *StartArg = cast(LCArgs[0]); - StartArg->setName("runStart"); - HelperArgs.insert(StartArg); - - Value *InputVal = LCInputs[0]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The third parameter defines the grain size, if it is not constant. - if (!isa(LCInputs[2])) { - Argument *GrainsizeArg = cast(LCArgs[2]); - GrainsizeArg->setName("grainSize"); - HelperArgs.insert(GrainsizeArg); - - Value *InputVal = LCInputs[2]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // Add the loop-centric kernel parameters (i.e., variables/arrays - // used in the loop body). 
- LLVM_DEBUG(dbgs() << " - adding loop-centric kernel arguments...\n"); - for (Value *V : TLInputsFixed) { - HelperArgs.insert(V); - HelperInputs.push_back(V); - LLVM_DEBUG(dbgs() << " - arg: " << V->getName() << "\n"); - } - - LLVM_DEBUG(dbgs() << " - adding helper kernel arguments...\n"); - for (Value *V : HelperInputs) { - OrderedInputs.push_back(V); - LLVM_DEBUG(dbgs() << " - helper arg: " << V->getName() << "\n"); - } - - LLVM_DEBUG(dbgs() << " - done.\n"); -} - -unsigned CudaLoop::getIVArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the primary induction variable is the second input. - return 1; -} - -unsigned CudaLoop::getLimitArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the loop limit is the first input. - return 0; -} - void CudaLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { LLVM_DEBUG(dbgs() << "debug[cuabi]: -preprocessing loop for kernel '" << KernelName << "'.\n"); @@ -378,23 +306,19 @@ void CudaLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the loop limit. Get // end and grainsize arguments - Argument *End; - Value *Grainsize; - { - // TODO: We only support a grain size of 1 right now. Not clear if this - // could be a future optimization but strip mining on our current tests only - // results in degraded performance... - auto OutlineArgsIter = KernelF->arg_begin(); - // End argument is the first LC arg. - End = &*OutlineArgsIter++; - - // Get the grainsize value, which is either constant or the third LC arg. - // if (unsigned ConstGrainsize = TLI.getGrainsize()) - // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - // else - Grainsize = - ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); - } + + // End argument is always the second argument in the kernel function. 
+ Argument *End = KernelF->getArg(1); + + // Get the grainsize value, which is either constant or the third LC arg. + // TODO: We only support a grain size of 1 right now. Not clear if this + // could be a future optimization but strip mining on our current tests only + // results in degraded performance. + // if (unsigned ConstGrainsize = TLI.getGrainsize()) + // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); + // else + Value *Grainsize = + ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); IRBuilder<> B(Entry->getTerminator()); @@ -433,14 +357,6 @@ void CudaLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, ClonedCond->setOperand(TripCountIdx, ThreadEnd); } -void CudaLoop::remapData(ValueToValueMapTy &VMap) { - for (auto &V : OrderedInputs) { - if (auto MappedV = VMap[V]) { - V = MappedV; - } - } -} - void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "cudaloop: processing outlined loop call...\n" @@ -468,12 +384,13 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, TapirLoopHints Hints(TL.getLoop()); Value *TPB = ConstantInt::get(Int32Ty, Hints.getThreadsPerBlock()); - BasicBlock *RCBB = TOI.ReplCall->getParent(); - BasicBlock *NewBB = RCBB->splitBasicBlock(TOI.ReplCall); + CallBase* CallOutlined = cast(TOI.ReplCall); + BasicBlock *RCBB = CallOutlined->getParent(); + BasicBlock *NewBB = RCBB->splitBasicBlock(CallOutlined); IRBuilder<> Builder(&NewBB->front()); // Deal with type mismatches for the trip count. 
- Value *TripCount = OrderedInputs[0]; + Value *TripCount = CallOutlined->getArgOperand(1); if (TripCount->getType() != Int64Ty) TripCount = Builder.CreateSExtOrBitCast(TripCount, Int64Ty, "cast.tc"); @@ -485,7 +402,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Builder.CreateIntrinsic(PtrTy, Intrinsic::kit_thread_stream, {CTT}); std::vector Args = {CTT, EmbFB, KName, TripCount, TPB, KProps, CudaStream}; - for (Value *Inp : OrderedInputs) + for (Value *Inp : CallOutlined->args()) Args.push_back(Inp); // TODO: We should probably have the launch and sync kitsune intrinsics take @@ -506,7 +423,7 @@ void CudaLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // the global again). copyNonConstGlobalsDToH(UsedGlobalValues, TTID::Cuda, M, Builder); - TOI.ReplCall->eraseFromParent(); + CallOutlined->eraseFromParent(); LLVM_DEBUG(dbgs() << "*** finished processing outlined call.\n"); } diff --git a/llvm/lib/Transforms/Tapir/GPUABI.cpp b/llvm/lib/Transforms/Tapir/GPUABI.cpp index d1909db60de42..c6502fbeb8938 100644 --- a/llvm/lib/Transforms/Tapir/GPUABI.cpp +++ b/llvm/lib/Transforms/Tapir/GPUABI.cpp @@ -12,11 +12,7 @@ // //===----------------------------------------------------------------------===// -#pragma warning "GPUABI has been deprecated" -#if 0 - #include "llvm/Transforms/Tapir/GPUABI.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" @@ -25,24 +21,37 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" -#include "llvm/Transforms/Vectorize.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IRReader/IRReader.h" #include "llvm/MC/TargetRegistry.h" -#include #include using namespace llvm; #define DEBUG_TYPE "gpuabi" +// JIT compiler kernel at containing function entry, makes timing easier at 
the +// cost of less laziness +static cl::opt + JIT("jit-callsite", cl::init(false), cl::NotHidden, + cl::desc("Wait until parallel loop is called to jit kernel. " + "(default=false)")); + Value *GPUABI::lowerGrainsizeCall(CallInst *GrainsizeCall) { - Value *Grainsize = ConstantInt::get(GrainsizeCall->getType(), 8); + IRBuilder<> BH(GrainsizeCall); + auto *M = GrainsizeCall->getModule(); + Type *LLVMInt64Ty = Type::getInt64Ty(M->getContext()); + Type *LLVMInt32Ty = Type::getInt32Ty(M->getContext()); + // we have to cast + Value *GSO = BH.CreateIntCast(GrainsizeCall->getOperand(0), LLVMInt64Ty, false); + Value *GS = BH.CreateCall(M->getOrInsertFunction("gpuGridSize", LLVMInt32Ty, LLVMInt64Ty), {GSO}); + Value *GSN = BH.CreateIntCast(GS, GrainsizeCall->getType(), false); + //FunctionCallee GGS = M->getOrInsertFunction("gpuGridSize", LLVMInt64Ty); // Replace uses of grainsize intrinsic call with this grainsize value. - GrainsizeCall->replaceAllUsesWith(Grainsize); - return Grainsize; + //GrainsizeCall->setCalledFunction(GGS); + GrainsizeCall->replaceAllUsesWith(GSN); + return GSN; } void GPUABI::lowerSync(SyncInst &SI) { @@ -54,8 +63,9 @@ void GPUABI::postProcessOutlinedTask(llvm::Function&, llvm::Instruction*, llvm:: void GPUABI::preProcessRootSpawner(llvm::Function&, BasicBlock *TFEntry){} void GPUABI::postProcessRootSpawner(llvm::Function&, BasicBlock *TFEntry){} -void GPUABI::preProcessFunction(Function &F, TaskInfo &TI, +bool GPUABI::preProcessFunction(Function &F, TaskInfo &TI, bool OutliningTapirLoops) { + return false; } void GPUABI::postProcessFunction(Function &F, bool OutliningTapirLoops) { @@ -68,8 +78,7 @@ void GPUABI::processSubTaskCall(TaskOutlineInfo &TOI, DominatorTree &DT) { } LoopOutlineProcessor * -GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL, - OptimizationLevel OptLevel = OptimizationLevel=O2) { +GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL) { if(!LOP) return new LLVMLoop(M); return LOP; @@ -79,7 +88,7 @@
GPUABI::getLoopOutlineProcessor(const TapirLoopInfo *TL, unsigned LLVMLoop::NextKernelID = 0; LLVMLoop::LLVMLoop(Module &M) - : LoopOutlineProcessor(M, LLVMM), LLVMM("kernelModule", M.getContext()) { + : LoopOutlineProcessor(M, TTOpts), LLVMM("kernelModule", M.getContext()) { ValueToValueMapTy VMap; // LLVMMptr = CloneModule(M, vmap, [](const GlobalValue* gv) { return false; }); // And named metadata.... @@ -93,19 +102,16 @@ LLVMLoop::LLVMLoop(Module &M) // Setup an LLVM triple. Triple LLVMTriple("spir64-unknown-unknown"); - LLVMM.setTargetTriple(LLVMTriple.str()); + LLVMM.setTargetTriple(LLVMTriple); // Insert runtime-function declarations in LLVM host modules. Type *LLVMInt32Ty = Type::getInt32Ty(LLVMM.getContext()); - Type *LLVMInt64Ty = Type::getInt64Ty(LLVMM.getContext()); GetThreadIdx = LLVMM.getOrInsertFunction("gtid", LLVMInt32Ty); - Function* getid = LLVMM.getFunction("gtid"); + Type *VoidTy = Type::getVoidTy(M.getContext()); - Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext()); + Type *VoidPtrTy = PointerType::getUnqual(M.getContext()); Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); - Type *Int8Ty = Type::getInt8Ty(M.getContext()); - Type *Int32Ty = Type::getInt32Ty(M.getContext()); Type *Int64Ty = Type::getInt64Ty(M.getContext()); GPUInit = M.getOrInsertFunction("initRuntime", VoidTy); GPULaunchKernel = M.getOrInsertFunction("launchBCKernel", VoidPtrTy, VoidPtrTy, Int64Ty, VoidPtrPtrTy, Int64Ty); @@ -177,17 +183,12 @@ unsigned LLVMLoop::getLimitArgIndex(const Function &F, const ValueSet &Args) void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, ValueToValueMapTy &VMap) { - LLVMContext &Ctx = M.getContext(); - Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); - //Type *Int64Ty = Type::getInt64Ty(Ctx); - //Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); Task *T = TL.getTask(); Loop *L = TL.getLoop(); - BasicBlock *Entry = cast(VMap[L->getLoopPreheader()]); BasicBlock *Header = cast(VMap[L->getHeader()]); + 
BasicBlock *Latch = cast(VMap[L->getLoopLatch()]); BasicBlock *Exit = cast(VMap[TL.getExitBlock()]); PHINode *PrimaryIV = cast(VMap[TL.getPrimaryInduction().first]); Value *PrimaryIVInput = PrimaryIV->getIncomingValueForBlock(Entry); @@ -201,10 +202,18 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Get the thread ID for this invocation of Helper. IRBuilder<> B(Entry->getTerminator()); Value *ThreadIdx = B.CreateCall(GetThreadIdx); - //Value *BlockIdx = B.CreateCall(GetBlockIdx, ConstantInt::get(Int32Ty, 0)); - //Value *BlockDim = B.CreateCall(GetBlockDim, ConstantInt::get(Int32Ty, 0)); Value *ThreadID = B.CreateIntCast(ThreadIdx, PrimaryIV->getType(), false); + // Loop should be handled in stripmining, here we just remove the loop by setting it to a jump + BranchInst *BI = cast(Latch->getTerminator()); + if(BI->getSuccessor(0) == Exit) + BI->setCondition(ConstantInt::get(BI->getCondition()->getType(), true)); + else + BI->setCondition(ConstantInt::get(BI->getCondition()->getType(), false)); + + //AV.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(LLVMCtx), + // 1))); + Function *Helper = Out.Outline; Helper->setName("kitsune_kernel"); @@ -218,25 +227,12 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // the loop limit with stride 1. The End argument encodes the loop limit. // Get end and grainsize arguments Argument *End; - Value *Grainsize; { - auto OutlineArgsIter = Helper->arg_begin(); + auto *OutlineArgsIter = Helper->arg_begin(); // End argument is the first LC arg. End = &*OutlineArgsIter; - - // Get the grainsize value, which is either constant or the third LC arg. - // ReplaceInstWithInst(gep, GetElementPtrInst::Create( - if (unsigned ConstGrainsize = TL.getGrainsize()) - Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - else - // Grainsize argument is the third LC arg. 
- Grainsize = &*++(++OutlineArgsIter); } - ThreadID = B.CreateMul(ThreadID, Grainsize); - Value *ThreadEndGrain = B.CreateAdd(ThreadID, Grainsize); - Value *Cmp = B.CreateICmp(ICmpInst::ICMP_ULT, ThreadEndGrain, End); - Value *ThreadEnd = B.CreateSelect(Cmp, ThreadEndGrain, End); - Value *Cond = B.CreateICmpUGE(ThreadID, ThreadEnd); + Value *Cond = B.CreateICmpUGE(ThreadID, End); ReplaceInstWithInst(Entry->getTerminator(), BranchInst::Create(Exit, Header, Cond)); @@ -246,10 +242,10 @@ void LLVMLoop::postProcessOutline(TapirLoopInfo &TL, TaskOutlineInfo &Out, // Update cloned loop condition to use the thread-end value. unsigned TripCountIdx = 0; ICmpInst *ClonedCond = cast(VMap[TL.getCondition()]); - if (ClonedCond->getOperand(0) != ThreadEnd) + if (ClonedCond->getOperand(0) != End) ++TripCountIdx; - ClonedCond->setOperand(TripCountIdx, ThreadEnd); - assert(ClonedCond->getOperand(TripCountIdx) == ThreadEnd && + ClonedCond->setOperand(TripCountIdx, End); + assert(ClonedCond->getOperand(TripCountIdx) == End && "End argument not used in condition"); } @@ -258,12 +254,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVMContext &Ctx = M.getContext(); Type *Int8Ty = Type::getInt8Ty(Ctx); - Type *Int32Ty = Type::getInt32Ty(Ctx); Type *Int64Ty = Type::getInt64Ty(Ctx); - Type *VoidPtrTy = Type::getInt8PtrTy(Ctx); + PointerType *VoidPtrTy = PointerType::getUnqual(Ctx); - //Task *T = TL.getTask(); - //Instruction *ReplCall = cast(TOI.ReplCall); LLVM_DEBUG(dbgs() << "Running processOutlinedLoopCall: " << LLVMM); Function *Parent = TOI.ReplCall->getFunction(); Value *TripCount = OrderedInputs[0]; @@ -276,7 +269,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // Compile the kernel //LLVMM.getFunctionList().remove(TOI.Outline); //TOI.Outline->eraseFromParent(); - LLVMContext &LLVMCtx = LLVMM.getContext(); ValueToValueMapTy VMap; // We recursively add definitions and declarations to the 
device module @@ -305,7 +297,8 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, SmallVector Returns; CloneFunctionInto(deviceF, f, VMap, CloneFunctionChangeType::DifferentModule, Returns); // GPU calls are slow as balls, try to force inlining - deviceF->addFnAttr(Attribute::AlwaysInline); + if(!deviceF->hasFnAttribute(Attribute::NoInline)) + deviceF->addFnAttr(Attribute::AlwaysInline); todo.push_back(deviceF); } } @@ -354,12 +347,9 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, PassManager->add(createReassociatePass()); PassManager->add(createGVNPass()); PassManager->add(createCFGSimplificationPass()); - PassManager->add(createLoopVectorizePass()); - PassManager->add(createSLPVectorizerPass()); //PassManager->add(createBreakCriticalEdgesPass()); //PassManager->add(createConstantPropagationPass()); PassManager->add(createDeadCodeEliminationPass()); - PassManager->add(createDeadStoreEliminationPass()); //PassManager->add(createInstructionCombiningPass()); PassManager->add(createCFGSimplificationPass()); PassManager->add(createDeadCodeEliminationPass()); @@ -389,15 +379,6 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, GlobalValue::PrivateLinkage, LLVMBC, "gpu_" + Twine("kitsune_kernel")); - //Value* TripCount = isSRetInput(TOI.InputSet[0]) ? 
TOI.InputSet[1] : TOI.InputSet[0]; - //Value *RunStart = ReplCall->getArgOperand(getIVArgIndex(*Parent, - // TOI.InputSet)); - //Value *TripCount = ReplCall->getArgOperand(getLimitArgIndex(*Parent, - // TOI.InputSet)); - - Value *KernelID = ConstantInt::get(Int32Ty, MyKernelID); - Value *LLVMPtr = B.CreateBitCast(LLVMGlobal, VoidPtrTy); - Type *VoidPtrPtrTy = VoidPtrTy->getPointerTo(); Constant *kernelSize = ConstantInt::get(Int64Ty, LLVMGlobal->getInitializer()->getType()->getArrayNumElements()); @@ -418,23 +399,12 @@ void LLVMLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, B.CreateStore(VoidVPtr, argPtr); } - Value *Grainsize = TL.getGrainsize() ? - ConstantInt::get(TripCount->getType(), TL.getGrainsize()) : - OrderedInputs[2]; - - //Type *Int64Ty = Type::getInt64Ty(LLVMM.getContext()); - Value *RunSizeQ = B.CreateUDiv(TripCount, Grainsize); - Value *RunRem = B.CreateURem(TripCount, Grainsize); - Value *IsRem = B.CreateICmp(ICmpInst::ICMP_UGT, RunRem, ConstantInt::get(RunRem->getType(), 0)); - Value *IsRemAdd = B.CreateZExt(IsRem, RunSizeQ->getType()); - Value *RunSize = B.CreateZExt(B.CreateAdd(RunSizeQ, IsRemAdd), Int64Ty); - - Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); - Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); - Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, RunSize }); + Value* argsPtr = B.CreateConstInBoundsGEP2_32(arrayType, argArray, 0, 0); + Value* bcPtr = B.CreateConstInBoundsGEP2_32(LLVMGlobal->getValueType(), LLVMGlobal, 0, 0); + Value *TripCount64 = B.CreateIntCast(TripCount, Int64Ty, true); + Value* stream = B.CreateCall(GPULaunchKernel, { bcPtr, kernelSize, argsPtr, TripCount64 }); B.CreateCall(GPUWaitKernel, stream); LLVM_DEBUG(dbgs() << "Finished processOutlinedLoopCall: " << M); } -#endif diff --git a/llvm/lib/Transforms/Tapir/HipABI.cpp b/llvm/lib/Transforms/Tapir/HipABI.cpp index d2b27ce7dcce0..38a9cfdec855d 
100644 --- a/llvm/lib/Transforms/Tapir/HipABI.cpp +++ b/llvm/lib/Transforms/Tapir/HipABI.cpp @@ -221,75 +221,6 @@ HipLoop::HipLoop(Module &M, Module &KM, StringRef Name, HipLoop::~HipLoop() { /* no-op */ } -// TODO: Can we also transform the arguments into a different address space here -// and avoid our use of 'mutate' elsewhere in the code? -void HipLoop::setupLoopOutlineArgs(Function &F, ValueSet &HelperArgs, - SmallVectorImpl &HelperInputs, - ValueSet &InputSet, - const SmallVectorImpl &LCArgs, - const SmallVectorImpl &LCInputs, - const ValueSet &TLInputsFixed) { - LLVM_DEBUG(dbgs() << "\n\n" - << "hipabi: SETTING UP LOOP OUTLINE ARGUMENTS FOR '" - << F.getName() << "()'.\n"); - - // Add the loop control inputs -- the first parameter defines the extent of - // the index space. - { - Argument *EndArg = cast(LCArgs[1]); - EndArg->setName(".kern.input_size"); // nice for debugging... - HelperArgs.insert(EndArg); - - Value *InputVal = LCInputs[1]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The second parameter defines the start of the index space. - { - Argument *StartArg = cast(LCArgs[0]); - StartArg->setName(".kern.start_idx"); - HelperArgs.insert(StartArg); - - Value *InputVal = LCInputs[0]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // The third parameter defines the grain size, if it is not constant. - if (!isa(LCInputs[2])) { - Argument *GrainsizeArg = cast(LCArgs[2]); - GrainsizeArg->setName(".kern.grain_size"); - HelperArgs.insert(GrainsizeArg); - - Value *InputVal = LCInputs[2]; - HelperInputs.push_back(InputVal); - InputSet.insert(InputVal); - } - - // Add the loop-centric kernel parameters (i.e., variables/arrays - // used in the loop body). 
- for (Value *V : TLInputsFixed) { - HelperArgs.insert(V); - HelperInputs.push_back(V); - } - - for (Value *V : HelperInputs) { - OrderedInputs.push_back(V); - } -} - -unsigned HipLoop::getIVArgIndex(const Function &F, const ValueSet &Args) const { - // The argument for the primary induction variable is the second input. - return 1; -} - -unsigned HipLoop::getLimitArgIndex(const Function &F, - const ValueSet &Args) const { - // The argument for the loop limit is the first input. - return 0; -} - void HipLoop::preProcessTapirLoop(TapirLoopInfo &TL, ValueToValueMapTy &VMap) { bool VerboseMode = getOptions().getTapirVerbose(); if (VerboseMode) { @@ -429,22 +360,19 @@ void HipLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, // loops use canonical induction variables, valid iterations range from 0 to // the loop limit with stride 1. The End argument encodes the limit. // Get end and grain size arguments - Argument *End; - Value *Grainsize; - { - // TODO: We really only want a grain size of 1 for now... - auto OutlineArgsIter = KernelF->arg_begin(); - // End argument is the first LC arg. - End = &*OutlineArgsIter++; - - // Get the grain size value, which is either constant or the third LC - // arg. - // if (unsigned ConstGrainsize = TLI.getGrainsize()) - // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); - // else - Grainsize = - ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); - } + + // End argument is always the second argument in the kernel function. + Argument *End = KernelF->getArg(1); + + // Get the grainsize value, which is either constant or the third LC arg. + // TODO: We only support a grain size of 1 right now. Not clear if this + // could be a future optimization but strip mining on our current tests only + // results in degraded performance. 
+ // if (unsigned ConstGrainsize = TLI.getGrainsize()) + // Grainsize = ConstantInt::get(PrimaryIV->getType(), ConstGrainsize); + // else + Value *Grainsize = + ConstantInt::get(PrimaryIV->getType(), DefaultGrainSize.getValue()); IRBuilder<> Builder(Entry->getTerminator()); @@ -500,12 +428,6 @@ void HipLoop::postProcessOutline(TapirLoopInfo &TLI, TaskOutlineInfo &Out, ClonedCond->setOperand(TripCountIdx, ThreadEnd); } -void HipLoop::remapData(ValueToValueMapTy &VMap) { - for (auto &V : OrderedInputs) - if (Value *MappedV = VMap[V]) - V = MappedV; -} - void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, DominatorTree &DT) { LLVM_DEBUG(dbgs() << "hiploop: processing outlined loop call...\n" @@ -541,12 +463,13 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, else TPB = ConstantInt::get(Int32Ty, 0); - BasicBlock *RCBB = TOI.ReplCall->getParent(); - BasicBlock *NewBB = RCBB->splitBasicBlock(TOI.ReplCall); + CallBase *CallOutlined = cast(TOI.ReplCall); + BasicBlock *RCBB = CallOutlined->getParent(); + BasicBlock *NewBB = RCBB->splitBasicBlock(CallOutlined); IRBuilder<> Builder(&NewBB->front()); // Deal with type mismatches for the trip count. - Value *TripCount = OrderedInputs[0]; + Value *TripCount = CallOutlined->getArgOperand(1); if (TripCount->getType() != Int64Ty) TripCount = Builder.CreateSExtOrBitCast(TripCount, Int64Ty, "cast.tc"); @@ -558,7 +481,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, Builder.CreateIntrinsic(PtrTy, Intrinsic::kit_thread_stream, {CTT}); std::vector Args = {CTT, EmbFB, KName, TripCount, TPB, KProps, HipStream}; - for (Value *Inp : OrderedInputs) + for (Value *Inp : CallOutlined->args()) Args.push_back(Inp); // TODO: We should probably have the launch and sync kitsune intrinsics take @@ -579,7 +502,7 @@ void HipLoop::processOutlinedLoopCall(TapirLoopInfo &TL, TaskOutlineInfo &TOI, // the global again). 
copyNonConstGlobalsDToH(UsedGlobalValues, TTID::Hip, M, Builder); - TOI.ReplCall->eraseFromParent(); + CallOutlined->eraseFromParent(); LLVM_DEBUG(dbgs() << "*** finished processing outlined call.\n"); } diff --git a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp index ca651c64c7ece..b1db7abb46776 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMine.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMine.cpp @@ -40,6 +40,7 @@ #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Transforms/Utils/LoopPeel.h" using namespace llvm; @@ -766,7 +767,8 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, const TargetTransformInfo &TTI, AssumptionCache *AC, TaskInfo *TI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, bool ParallelEpilog, - bool NeedNestedSync, Loop **RemainderLoop) { + bool NeedNestedSync, Loop **RemainderLoop, + bool GPU) { Task *T = getTapirLoopForStripMining(L, *TI, ORE); if (!T) return nullptr; @@ -815,6 +817,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, BasicBlock *Latch = L->getLoopLatch(); BasicBlock *Header = L->getHeader(); BasicBlock *TaskEntry = T->getEntry(); + assert(isa(Header->getTerminator()) && "Header not terminated by a detach."); DetachInst *DI = cast(Header->getTerminator()); @@ -825,6 +828,10 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0; BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex); + + Function *F = Header->getParent(); + LLVM_DEBUG(dbgs() << "Function before strip mining\n" << *F); + // We will use the increment of the primary induction variable to derive // wrapping flags. 
Instruction *PrimaryInc = @@ -878,7 +885,12 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, return nullptr; } - LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << Count << "\n"); + if(GPU){ + LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << "gpu grainsize call" << "\n"); + } + else { + LLVM_DEBUG(dbgs() << "Stripmining loop using grainsize " << Count << "\n"); + } using namespace ore; ORE->emit([&]() { return OptimizationRemark(LSM_NAME, "Stripmined", @@ -894,6 +906,7 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // ... // Latch // LatchExit + Module *M = F->getParent(); // Insert the epilog remainder. BasicBlock *NewPreheader; @@ -937,41 +950,74 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // *EpilogPreheader // LatchExit - IRBuilder<> B(PreheaderBR); + IRBuilder<> B(PreheaderBR); Value *ModVal; - // Calculate ModVal = (BECount + 1) % Count. - // Note that TripCount is BECount + 1. - if (isPowerOf2_32(Count)) { - // When Count is power of 2 we don't BECount for epilog case. However we'll - // need it for a branch around stripmined loop for prolog case. - ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - // 1. There are no iterations to be run in the prolog/epilog loop. - // OR - // 2. The addition computing TripCount overflowed. - // - // If (2) is true, we know that TripCount really is (1 << BEWidth) and so - // the number of iterations that remain to be run in the original loop is a - // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we - // explicitly check this above). - if (TL.isInclusiveRange()) - ModVal = B.CreateAdd(ModVal, ConstantInt::get(ModVal->getType(), 1)); - } else { - // As (BECount + 1) can potentially unsigned overflow we count - // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. 
- Value *ModValTmp = B.CreateURem(BECount, - ConstantInt::get(BECount->getType(), - Count)); - Value *ModValAdd = B.CreateAdd(ModValTmp, - ConstantInt::get(ModValTmp->getType(), 1)); - // At that point (BECount % Count) + 1 could be equal to Count. - // To handle this case we need to take mod by Count one more time. - ModVal = B.CreateURem(ModValAdd, - ConstantInt::get(BECount->getType(), Count), - "xtraiter"); + Value *StepSize; + Value *BranchVal; + // Int the gpu case we don't need an epilogue + // If we start with forall(i=0..n) + // GPU stripmine converts to + // forall(i=0; igetEntryBlock().getFirstNonPHI()); + + Instruction* bloc; + if(Instruction* I = dyn_cast(TripCount)){ + bloc = I->getNextNode(); + } else { + bloc = F->getEntryBlock().getTerminator(); + } + + IRBuilder<> B2(bloc); + StepSize = B2.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::tapir_loop_grainsize, + { TripCount->getType() }), { TripCount }); + + + BranchVal = B.CreateICmpULE(ModVal, ConstantInt::get(ModVal->getType(), 0)); + } + else { + // Calculate ModVal = (BECount + 1) % Count. + // Note that TripCount is BECount + 1. + if (isPowerOf2_32(Count)) { + // When Count is power of 2 we don't BECount for epilog case. However we'll + // need it for a branch around stripmined loop for prolog case. + ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); + // 1. There are no iterations to be run in the prolog/epilog loop. + // OR + // 2. The addition computing TripCount overflowed. + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so + // the number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). 
+ if (TL.isInclusiveRange()) + ModVal = B.CreateAdd(ModVal, ConstantInt::get(ModVal->getType(), 1)); + } else { + // As (BECount + 1) can potentially unsigned overflow we count + // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count. + Value *ModValTmp = B.CreateURem(BECount, + ConstantInt::get(BECount->getType(), + Count)); + Value *ModValAdd = B.CreateAdd(ModValTmp, + ConstantInt::get(ModValTmp->getType(), 1)); + // At that point (BECount % Count) + 1 could be equal to Count. + // To handle this case we need to take mod by Count one more time. + ModVal = B.CreateURem(ModValAdd, + ConstantInt::get(BECount->getType(), Count), + "xtraiter"); + } + BranchVal = B.CreateICmpSLT( + BECount, ConstantInt::get(BECount->getType(), + TL.isInclusiveRange() ? Count : Count - 1)); } - Value *BranchVal = B.CreateICmpSLT( - BECount, ConstantInt::get(BECount->getType(), - TL.isInclusiveRange() ? Count : Count - 1)); BasicBlock *RemainderLoopBB = NewExit; BasicBlock *StripminedLoopBB = NewPreheader; // Branch to either remainder (extra iterations) loop or stripmined loop. @@ -980,7 +1026,6 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, if (DT) DT->changeImmediateDominator(NewExit, Preheader); - Function *F = Header->getParent(); // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog/epilog code LoopBlocksDFS LoopBlocks(L); @@ -1049,21 +1094,22 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // TODO: For stripmine factor 2 remainder loop will have 1 iterations. // Do not create 1 iteration loop. // bool CreateRemainderLoop = (Count != 2); - bool CreateRemainderLoop = true; + bool CreateRemainderLoop = !GPU; // Clone all the basic blocks in the loop. If Count is 2, we don't clone // the loop, otherwise we create a cloned loop to execute the extra // iterations. This function adds the appropriate CFG connections. 
BasicBlock *InsertBot = LatchExit; BasicBlock *InsertTop = EpilogPreheader; - *RemainderLoop = - cloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, - InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, - ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI, Count); + if(CreateRemainderLoop){ + *RemainderLoop = + cloneLoopBlocks(L, ModVal, CreateRemainderLoop, true, UnrollRemainder, + InsertTop, InsertBot, NewPreheader, NewBlocks, LoopBlocks, + ExtraTaskBlocks, SharedEHTaskBlocks, VMap, DT, LI, Count); - // Insert the cloned blocks into the function. - F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), - F->end()); + // Insert the cloned blocks into the function. + F->splice(InsertBot->getIterator(), &*F, NewBlocks[0]->getIterator(), + F->end()); // Loop structure should be the following: // Epilog @@ -1082,51 +1128,51 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Rewrite the cloned instruction operands to use the values created when the // clone is created. - for (BasicBlock *BB : NewBlocks) - for (Instruction &I : *BB) - RemapInstruction(&I, VMap, - RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + for (BasicBlock *BB : NewBlocks) + for (Instruction &I : *BB) + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); - // Serialize the cloned loop body to render the inner loop serial. - { - // Translate all the analysis for the new cloned task. - SmallVector ClonedReattaches; - for (Instruction *I : Reattaches) - ClonedReattaches.push_back(cast(VMap[I])); - SmallPtrSet ClonedEHBlockPreds; - for (BasicBlock *B : EHBlockPreds) - ClonedEHBlockPreds.insert(cast(VMap[B])); - SmallVector ClonedEHBlocks; - for (BasicBlock *B : EHBlocksToClone) - ClonedEHBlocks.push_back(cast(VMap[B])); - // Landing pads and detached-rethrow instructions may or may not have been - // cloned. 
- SmallPtrSet ClonedInlinedLPads; - for (LandingPadInst *LPad : InlinedLPads) { - if (VMap[LPad]) - ClonedInlinedLPads.insert(cast(VMap[LPad])); - else - ClonedInlinedLPads.insert(LPad); - } - SmallVector ClonedDetachedRethrows; - for (Instruction *DR : DetachedRethrows) { - if (VMap[DR]) - ClonedDetachedRethrows.push_back(cast(VMap[DR])); - else - ClonedDetachedRethrows.push_back(DR); + // Serialize the cloned loop body to render the inner loop serial. + { + // Translate all the analysis for the new cloned task. + SmallVector ClonedReattaches; + for (Instruction *I : Reattaches) + ClonedReattaches.push_back(cast(VMap[I])); + SmallPtrSet ClonedEHBlockPreds; + for (BasicBlock *B : EHBlockPreds) + ClonedEHBlockPreds.insert(cast(VMap[B])); + SmallVector ClonedEHBlocks; + for (BasicBlock *B : EHBlocksToClone) + ClonedEHBlocks.push_back(cast(VMap[B])); + // Landing pads and detached-rethrow instructions may or may not have been + // cloned. + SmallPtrSet ClonedInlinedLPads; + for (LandingPadInst *LPad : InlinedLPads) { + if (VMap[LPad]) + ClonedInlinedLPads.insert(cast(VMap[LPad])); + else + ClonedInlinedLPads.insert(LPad); + } + SmallVector ClonedDetachedRethrows; + for (Instruction *DR : DetachedRethrows) { + if (VMap[DR]) + ClonedDetachedRethrows.push_back(cast(VMap[DR])); + else + ClonedDetachedRethrows.push_back(DR); + } + DetachInst *ClonedDI = cast(VMap[DI]); + // Serialize the new task. + SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, + ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, + &ClonedInlinedLPads, &ClonedDetachedRethrows, + NeedToInsertTaskFrame, DT, nullptr, LI); } - DetachInst *ClonedDI = cast(VMap[DI]); - // Serialize the new task. - SerializeDetach(ClonedDI, ParentEntry, EHCont, EHContLPadVal, - ClonedReattaches, &ClonedEHBlocks, &ClonedEHBlockPreds, - &ClonedInlinedLPads, &ClonedDetachedRethrows, - NeedToInsertTaskFrame, DT, nullptr, LI); } // Detach the stripmined loop. 
Value *SyncReg = DI->getSyncRegion(), *NewSyncReg; BasicBlock *EpilogPred, *LoopDetEntry, *LoopReattach; - Module *M = F->getParent(); if (ParallelEpilog) { ORE->emit([&]() { return OptimizationRemark(LSM_NAME, "ParallelEpil", @@ -1339,48 +1385,66 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // // TODO: Generalize to handle non-power-of-2 counts. assert(isPowerOf2_32(Count) && "Count is not a power of 2."); - Value *TestVal = B2.CreateUDiv(TripCount, - ConstantInt::get(TripCount->getType(), Count), - "stripiter"); - // Value *TestVal = B2.CreateSub(TripCount, ModVal, "stripiter", true, true); - - // Value *TestCmp = B2.CreateICmpUGT(TestVal, - // ConstantInt::get(TestVal->getType(), 0), - // TestVal->getName() + ".ncmp"); - // ReplaceInstWithInst(NewPreheader->getTerminator(), - // BranchInst::Create(Header, LatchExit, TestCmp)); - // DT->changeImmediateDominator(LatchExit, - // DT->findNearestCommonDominator(LatchExit, - // NewPreheader)); - - // Add new counter for new outer loop. - // - // We introduce a new primary induction variable, NewIdx, into the outer loop, - // which counts up to the outer-loop trip count from 0, stepping by 1. In - // contrast to counting down from the outer-loop trip count, this new variable - // ensures that future loop passes, including LoopSpawning, can process this - // outer loop when we're done. 
- PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter"); - NewIdx->insertBefore(NewHeader->getFirstNonPHIIt()); - B2.SetInsertPoint(NewLatch->getTerminator()); - // Instruction *IdxSub = cast( - // B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), - // NewIdx->getName() + ".nsub")); - // IdxSub->copyIRFlags(PrimaryInc); - Instruction *IdxAdd = cast( - B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), - NewIdx->getName() + ".nadd")); - IdxAdd->copyIRFlags(PrimaryInc); - - // NewIdx->addIncoming(TestVal, NewPreheader); - // NewIdx->addIncoming(IdxSub, NewLatch); - // Value *IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); - NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); - NewIdx->addIncoming(IdxAdd, NewLatch); - Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, - NewIdx->getName() + ".ncmp"); - ReplaceInstWithInst(NewLatch->getTerminator(), - BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + PHINode *NewIdx; + if(GPU){ + Value *TestVal = StepSize; + NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + NewHeader->getFirstNonPHIIt()); + B2.SetInsertPoint(NewLatch->getTerminator()); + Instruction *IdxAdd = cast( + B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nadd")); + IdxAdd->copyIRFlags(PrimaryInc); + NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); + NewIdx->addIncoming(IdxAdd, NewLatch); + Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, + NewIdx->getName() + ".ncmp"); + ReplaceInstWithInst(NewLatch->getTerminator(), + BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + } else { + Value *TestVal = B2.CreateUDiv(TripCount, + ConstantInt::get(TripCount->getType(), Count), + "stripiter"); + // Value *TestVal = B2.CreateSub(TripCount, ModVal, "stripiter", true, true); + + // Value *TestCmp = B2.CreateICmpUGT(TestVal, + // ConstantInt::get(TestVal->getType(), 0), + // TestVal->getName() + ".ncmp"); + // 
ReplaceInstWithInst(NewPreheader->getTerminator(), + // BranchInst::Create(Header, LatchExit, TestCmp)); + // DT->changeImmediateDominator(LatchExit, + // DT->findNearestCommonDominator(LatchExit, + // NewPreheader)); + + // Add new counter for new outer loop. + // + // We introduce a new primary induction variable, NewIdx, into the outer loop, + // which counts up to the outer-loop trip count from 0, stepping by 1. In + // contrast to counting down from the outer-loop trip count, this new variable + // ensures that future loop passes, including LoopSpawning, can process this + // outer loop when we're done. + NewIdx = PHINode::Create(TestVal->getType(), 2, "niter", + NewHeader->getFirstNonPHIIt()); + B2.SetInsertPoint(NewLatch->getTerminator()); + // Instruction *IdxSub = cast( + // B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + // NewIdx->getName() + ".nsub")); + // IdxSub->copyIRFlags(PrimaryInc); + Instruction *IdxAdd = cast( + B2.CreateAdd(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".nadd")); + IdxAdd->copyIRFlags(PrimaryInc); + + // NewIdx->addIncoming(TestVal, NewPreheader); + // NewIdx->addIncoming(IdxSub, NewLatch); + // Value *IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp"); + NewIdx->addIncoming(ConstantInt::get(TestVal->getType(), 0), LoopDetEntry); + NewIdx->addIncoming(IdxAdd, NewLatch); + Value *IdxCmp = B2.CreateICmpEQ(IdxAdd, TestVal, + NewIdx->getName() + ".ncmp"); + ReplaceInstWithInst(NewLatch->getTerminator(), + BranchInst::Create(LoopReattach, NewHeader, IdxCmp)); + } DT->changeImmediateDominator(NewLatch, NewHeader); // The block structure of the stripmined loop should now look like so: // @@ -1460,75 +1524,112 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount, // Update all of the old PHI nodes B2.SetInsertPoint(NewEntry->getTerminator()); - Instruction *CountVal = cast( - B2.CreateMul(ConstantInt::get(NewIdx->getType(), Count), - NewIdx)); - 
CountVal->copyIRFlags(PrimaryInduction); - for (auto &InductionEntry : *TL.getInductionVars()) { - PHINode *OrigPhi = InductionEntry.first; - const InductionDescriptor &II = InductionEntry.second; - if (II.getStep()->isZero()) - // Nothing to do for this Phi - continue; - // Get the new step value for this Phi. - Value *PhiCount = !II.getStep()->getType()->isIntegerTy() - ? B2.CreateCast(Instruction::SIToFP, CountVal, - II.getStep()->getType()) - : B2.CreateSExtOrTrunc(CountVal, II.getStep()->getType()); - Value *NewStart = emitTransformedIndex(B2, PhiCount, SE, DL, II); - - // Get the old increment instruction for this Phi - int Idx = OrigPhi->getBasicBlockIndex(NewEntry); - OrigPhi->setIncomingValue(Idx, NewStart); + if(GPU){ + } + else{ + Instruction *CountVal = cast( + B2.CreateMul(ConstantInt::get(NewIdx->getType(), Count), + NewIdx)); + CountVal->copyIRFlags(PrimaryInduction); + for (auto &InductionEntry : *TL.getInductionVars()) { + PHINode *OrigPhi = InductionEntry.first; + const InductionDescriptor &II = InductionEntry.second; + if (II.getStep()->isZero()) + // Nothing to do for this Phi + continue; + // Get the new step value for this Phi. + Value *PhiCount = !II.getStep()->getType()->isIntegerTy() + ? B2.CreateCast(Instruction::SIToFP, CountVal, + II.getStep()->getType()) + : B2.CreateSExtOrTrunc(CountVal, II.getStep()->getType()); + Value *NewStart = emitTransformedIndex(B2, PhiCount, SE, DL, II); + + // Get the old increment instruction for this Phi + int Idx = OrigPhi->getBasicBlockIndex(NewEntry); + OrigPhi->setIncomingValue(Idx, NewStart); + } } - // Add new induction variable for inner loop. 
- PHINode *InnerIdx = - PHINode::Create(PrimaryInduction->getType(), 2, "inneriter"); - InnerIdx->insertBefore(Header->getFirstNonPHIIt()); - Value *InnerTestVal = ConstantInt::get(PrimaryInduction->getType(), Count); - B2.SetInsertPoint(LatchBR); - Instruction *InnerSub = cast( - B2.CreateSub(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), - InnerIdx->getName() + ".nsub")); - InnerSub->copyIRFlags(PrimaryInc); - // Instruction *InnerAdd = cast( - // B2.CreateAdd(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), - // InnerIdx->getName() + ".nadd")); - // InnerAdd->copyIRFlags(PrimaryInc); - Value *InnerCmp; - if (LatchBR->getSuccessor(0) == Header) - InnerCmp = B2.CreateIsNotNull(InnerSub, InnerIdx->getName() + ".ncmp"); - else - InnerCmp = B2.CreateIsNull(InnerSub, InnerIdx->getName() + ".ncmp"); - InnerIdx->addIncoming(InnerTestVal, NewEntry); - InnerIdx->addIncoming(InnerSub, Latch); - // if (LatchBR->getSuccessor(0) == Header) - // InnerCmp = B2.CreateICmpNE(InnerAdd, InnerTestVal, - // InnerIdx->getName() + ".ncmp"); - // else - // InnerCmp = B2.CreateICmpEQ(InnerAdd, InnerTestVal, - // InnerIdx->getName() + ".ncmp"); - // InnerIdx->addIncoming(ConstantInt::get(InnerIdx->getType(), 0), NewEntry); - // InnerIdx->addIncoming(InnerAdd, Latch); - LatchBR->setCondition(InnerCmp); + if(GPU){ + // GPU mode: inner loop strides by grainsize. + PHINode *InnerIdx = + PHINode::Create(PrimaryInduction->getType(), 2, "inneriter", + Header->getFirstNonPHIIt()); + // Initialize inner index to zero. + //Value *Zero = ConstantInt::get(PrimaryInduction->getType(), 0); + B2.SetInsertPoint(LatchBR->getParent()->getFirstNonPHIIt()); + // Instead of subtracting one, add the grainsize. + + Value *NextIdx = B2.CreateAdd(InnerIdx, StepSize, + InnerIdx->getName() + ".nadd_stride"); + + //NextIdx->copyIRFlags(PrimaryInc); + // Check if the new index is still within the original trip count. 
+ InnerIdx->addIncoming(NewIdx, NewEntry); + InnerIdx->addIncoming(NextIdx, Latch); + Value *InnerCmp; + if (LatchBR->getSuccessor(0) == Header) + InnerCmp = B2.CreateICmpULT(NextIdx, TripCount, + InnerIdx->getName() + ".ncmp_final"); + + else + InnerCmp = B2.CreateICmpUGE(NextIdx, TripCount, + InnerIdx->getName() + ".ncmp_final"); + + LatchBR->setCondition(InnerCmp); + // In the gpu case, we actually want to replace the induction variable + PrimaryInduction->replaceAllUsesWith(InnerIdx); + } else { + // Add new induction variable for inner loop. + PHINode *InnerIdx = PHINode::Create(PrimaryInduction->getType(), 2, + "inneriter", + Header->getFirstNonPHIIt()); + Value *InnerTestVal = ConstantInt::get(PrimaryInduction->getType(), Count); + B2.SetInsertPoint(LatchBR); + Instruction *InnerSub = cast( + B2.CreateSub(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + InnerIdx->getName() + ".nsub")); + InnerSub->copyIRFlags(PrimaryInc); + // Instruction *InnerAdd = cast( + // B2.CreateAdd(InnerIdx, ConstantInt::get(InnerIdx->getType(), 1), + // InnerIdx->getName() + ".nadd")); + // InnerAdd->copyIRFlags(PrimaryInc); + Value *InnerCmp; + if (LatchBR->getSuccessor(0) == Header) + InnerCmp = B2.CreateIsNotNull(InnerSub, InnerIdx->getName() + ".ncmp"); + else + InnerCmp = B2.CreateIsNull(InnerSub, InnerIdx->getName() + ".ncmp"); + InnerIdx->addIncoming(InnerTestVal, NewEntry); + InnerIdx->addIncoming(InnerSub, Latch); + // if (LatchBR->getSuccessor(0) == Header) + // InnerCmp = B2.CreateICmpNE(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // else + // InnerCmp = B2.CreateICmpEQ(InnerAdd, InnerTestVal, + // InnerIdx->getName() + ".ncmp"); + // InnerIdx->addIncoming(ConstantInt::get(InnerIdx->getType(), 0), NewEntry); + // InnerIdx->addIncoming(InnerAdd, Latch); + LatchBR->setCondition(InnerCmp); + } // Connect the epilog code to the original loop and update the PHI functions. 
   B2.SetInsertPoint(EpilogPreheader->getTerminator());
-  // Compute the start of the epilog iterations.  We use a divide and multiply
-  // by the power-of-2 count to simplify the SCEV's of the induction variables
-  // for later analysis passes.
-  // Value *EpilStartIter = B2.CreateSub(TripCount, ModVal);
-  Value *EpilStartIter =
-      B2.CreateMul(B2.CreateUDiv(TripCount,
-                                 ConstantInt::get(TripCount->getType(), Count)),
-                   ConstantInt::get(TripCount->getType(), Count));
-  if (Instruction *ESIInst = dyn_cast<Instruction>(EpilStartIter))
-    ESIInst->copyIRFlags(PrimaryInc);
-  connectEpilog(TL, EpilStartIter, ModVal, EpilogPred, LoopReattach, NewExit,
-                LatchExit, Preheader, EpilogPreheader, VMap, DT, LI, SE, DL,
-                PreserveLCSSA);
+  if(!GPU){
+    // Compute the start of the epilog iterations.  We use a divide and multiply
+    // by the power-of-2 count to simplify the SCEV's of the induction variables
+    // for later analysis passes.
+    // Value *EpilStartIter = B2.CreateSub(TripCount, ModVal);
+    Value *EpilStartIter =
+        B2.CreateMul(B2.CreateUDiv(TripCount,
+                                   ConstantInt::get(TripCount->getType(), Count)),
+                     ConstantInt::get(TripCount->getType(), Count));
+    if (Instruction *ESIInst = dyn_cast<Instruction>(EpilStartIter))
+      ESIInst->copyIRFlags(PrimaryInc);
+    connectEpilog(TL, EpilStartIter, ModVal, EpilogPred, LoopReattach, NewExit,
+                  LatchExit, Preheader, EpilogPreheader, VMap, DT, LI, SE, DL,
+                  PreserveLCSSA);
+  }
 
   // If this loop is nested, then the loop stripminer changes the code in the
   // any of its parent loops, so the Scalar Evolution pass needs to be run
@@ -1548,29 +1649,276 @@ Loop *llvm::StripMineLoop(Loop *L, unsigned Count, bool AllowExpensiveTripCount,
   // }
 
   // Record that the remainder loop was derived from a Tapir loop.
-  (*RemainderLoop)->setDerivedFromTapirLoop();
+  if(!GPU)
+    (*RemainderLoop)->setDerivedFromTapirLoop();
 
   // At this point, the code is well formed.  We now simplify the new loops,
   // doing constant propagation and dead code elimination as we go.
   simplifyLoopAfterStripMine(L, /*SimplifyIVs*/ true, LI, SE, DT, TTI, AC);
   simplifyLoopAfterStripMine(NewLoop, /*SimplifyIVs*/ true, LI, SE, DT, TTI,
                              AC);
-  simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT,
-                             TTI, AC);
+  if(!GPU)
+    simplifyLoopAfterStripMine(*RemainderLoop, /*SimplifyIVs*/ true, LI, SE, DT,
+                               TTI, AC);
+
+  // TODO: update all the analyses manually
 #ifndef NDEBUG
-  DT->verify();
-  LI->verify(*DT);
+  //DT->verify();
+  //LI->verify(*DT);
 #endif
 
   // Record that the old loop was derived from a Tapir loop.
   L->setDerivedFromTapirLoop();
 
   // Update TaskInfo manually using the updated DT.
-  if (TI)
+  //if (TI)
     // FIXME: Recalculating TaskInfo for the whole function is wasteful.
     // Optimize this routine in the future.
-    TI->recalculate(*F, *DT);
+    //TI->recalculate(*F, *DT);
+
+  // Reductions take a parallel loop
+  // forall(i=0; i < N){
+  //   for(...) // epilogue logic
+  //     BODY
+  //     sum(&reds[i], a[j]; 0.0)
+  // }
+  // for(i=0; i < nouter; i++) ...  [NOTE(review): original comment text was
+  // lost to angle-bracket stripping in extraction; reconstructed — confirm
+  // against upstream]
+  const std::vector<BasicBlock *> &blocks = L->getBlocks();
+  std::set<std::pair<CallInst *, Type *>> reductions;
+  for (BasicBlock *BB : blocks){
+    for (Instruction &I : *BB) {
+      if(auto ci = dyn_cast<CallInst>(&I)){
+        auto f = ci->getCalledFunction();
+        if(f->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+          LLVM_DEBUG(dbgs() << "Found reduction var: " << ci->getArgOperand(0)->getName() <<
+                     "with reduction function: " << f->getName() << "\n");
+          auto ty = ci->getArgOperand(1)->getType();
+          reductions.insert(std::make_pair(ci, ty));
+          //TODO: check the type to confirm valid reduction
+        }
+      }
+    }
+  }
+
+  // To make the stripmining work for multiple backends, we parameterize on the step and the termination condition
+  // Roughly speaking, we want the CPU to look like
+  // n = p*s + k
+  // forall(i = 0; i < p; i++) ...  [NOTE(review): original pseudocode lost to
+  // angle-bracket stripping in extraction; reconstructed — confirm upstream]
+  std::vector<std::tuple<CallInst *, Value *, Value *, Type *, Value *>> redMap;
+  // TODO: Modify the strip mining outer loop to be smaller: currently we are
+  // stack allocating n/2048 reduction values.
+  // TODO: Initialize local reductions with unit values
+  // TODO: move insertion point for reduction allocation
+  // TODO: free reduction allocation
+  Instruction* bloc = nullptr;
+  if(Instruction* I = dyn_cast<Instruction>(TripCount)){
+    bloc = I->getParent()->getTerminator();
+  } else {
+    bloc = F->getEntryBlock().getTerminator();
+  }
+  IRBuilder<> RB(bloc);
+  Value *outerIters;
+  if(!GPU)
+    outerIters = RB.CreateUDiv(TripCount,
+                               ConstantInt::get(TripCount->getType(), Count),
+                               "stripiter");
+  else
+    outerIters = StepSize;
+
+  // Here we iterate over the reductions (calls to reduction functions), and
+  // allocate the local reduction variable array, and build the association
+  // array redMap, and replace references to the original reduction variable
+  // with references to the new local reduction variable in the body of the
+  // inner loop
+  auto nred = RB.CreateAdd(outerIters, ConstantInt::get(outerIters->getType(), 1));
+  for(auto &pair : reductions){
+    // TODO: generic allocation/free calls
+    auto ci = pair.first;
+    auto ptr = ci->getArgOperand(0);
+    auto unit = ci->getArgOperand(2);
+    auto ty = pair.second;
+    auto gmmTy = FunctionType::get(ptr->getType(), { nred->getType() }, false);
+    auto arrSize = RB.CreateMul(nred, ConstantInt::get(nred->getType(), DL.getTypeAllocSize(ty)));
+    auto al = RB.CreateCall(M->getOrInsertFunction("gpuManagedMalloc", gmmTy), {arrSize});
+    //auto al = RB.CreateBitCast(rm, ty);
+    //auto al = RB.CreateAlloca(ty, nred, ptr->getName() + "_reduction");
+    IRBuilder<> BH(NewLoop->getHeader()->getTerminator());
+    auto lptr = BH.CreateBitCast(
+        BH.CreateGEP(ty, al, NewIdx),
+        ptr->getType());
+    redMap.push_back(std::make_tuple(ci, ptr, al, ty, unit));
+    // Assume there is more than one element, and
+    // use the first element for the first iteration of the loop.
+    // roughly:
+    // red = init;
+    // forall(i = ...){
+    //   red = reduce(red, body(i));
+    // }
+    // red = init;
+    // localred[m+1];
+    //
+    // forall(k ∈ 0..m-1){
+    //   localred[i] = body(j_0);
+    //   for(j ∈ j_k_1..j_k_l-1)
+    //     reduce(localred+i, body(j));
+    // }
+    // for( j ∈ j_k_m .. n )
+    //   reduce(localred+m, body(j));
+    //
+    // for(k ∈ 0..m)
+    //   reduce(&red, localred[k]);
+    //
+    ptr->replaceUsesWithIf(lptr, [L](Use &u){
+      if(auto I = dyn_cast<Instruction>(u.getUser())){
+        return L->contains(I->getParent());
+      } else {
+        return false;
+      };
+    });
+  }
+
+  // Epilog "join" of reduction values stored in local reduction value arrays.
+  // Should be able to use redMap to map original pointer (which is still used
+  // to reduce the remainder of the strimined loop, so you probably want to
+  // start the reduction with that value).
+  LLVM_DEBUG(dbgs() << "Function after strip mining, before reduction epilogue\n" << *F);
+
+  if(!reductions.empty()){
+    ValueToValueMapTy VMap;
+    SmallVector<Instruction *, 4> CIS;
+    for(auto &BB : NewLoop->blocks()){
+      // We find the location that we reduce into and create a store of unit
+      // TODO: Get unit value for reduction
+      for(auto &I : *BB){
+        if(auto *CI = dyn_cast<CallInst>(&I)){
+          auto *F = CI->getCalledFunction();
+          if(F->getAttributes().hasAttrSomewhere(Attribute::KitsuneReduction)){
+            // this must be defined in the outer parallel loop but before the inner loop
+            IRBuilder<> PB(dyn_cast<Instruction>(CI->getArgOperand(0))->getNextNode());
+            PB.CreateStore(CI->getArgOperand(2), CI->getArgOperand(0));
+            CIS.push_back(&I);
+            F->removeFnAttr(Attribute::NoInline);
+            F->removeFnAttr(Attribute::OptimizeNone);
+          }
+        }
+      }
+    }
+
+    // We insert the reduction code at every sync corresponding to the strimined
+    // loop
+    //
+    // Sync
+    // RedEpiHeader
+    // RedEpiBody
+    // RedEpiExit
+
+    Instruction* term = LatchExit->getTerminator();
+    BasicBlock *PostSync = term->getSuccessor(0);
+    BasicBlock* RedEpiHeader = BasicBlock::Create(LatchExit->getContext(), "reductionEpilogue", LatchExit->getParent(),
LatchExit); + RedEpiHeader->moveAfter(LatchExit); + ReplaceInstWithInst(term, SyncInst::Create(RedEpiHeader, SyncReg)); + BranchInst::Create(PostSync, RedEpiHeader); + PHINode *Idx = PHINode::Create(outerIters->getType(), 2, + "reductionepilogueidx", + RedEpiHeader->getFirstNonPHIIt()); + IRBuilder<> BH(RedEpiHeader, RedEpiHeader->getFirstNonPHIIt()); + Idx->addIncoming(ConstantInt::get(outerIters->getType(), 0), LatchExit); + Instruction *bodyTerm, *exitTerm; + Value *cmp = BH.CreateCmp(CmpInst::ICMP_NE, Idx, outerIters); + SplitBlockAndInsertIfThenElse(cmp, RedEpiHeader->getTerminator(), &bodyTerm, &exitTerm); + + IRBuilder<> BB(bodyTerm); + // For each reduction, get the allocated thread local reduced values and + // reduce them. + for(auto& kv : redMap){ + const auto [ ci, ptr, al, ty, unit ] = kv; + auto lptr = BB.CreateBitCast( + BB.CreateGEP(ty, al, Idx), + ptr->getType()); + auto x = BB.CreateLoad(ty, lptr); + BB.SetCurrentDebugLocation(ci->getDebugLoc()); + BB.CreateCall(ci->getCalledFunction(), { ptr, x , unit}); + } + Value *IdxAdd = + BB.CreateAdd(Idx, ConstantInt::get(Idx->getType(), 1), + Idx->getName() + ".add"); + BasicBlock* body = bodyTerm->getParent(); + BasicBlock* loopExit = exitTerm->getParent(); + Idx->addIncoming(IdxAdd, body); + ReplaceInstWithInst(bodyTerm, BranchInst::Create(RedEpiHeader)); + + // Update Loopinfo with reduction loop + Loop* RL = LI->AllocateLoop(); + if(ParentLoop) ParentLoop->addChildLoop(RL); + else LI->addTopLevelLoop(RL); + if(!ParentLoop){ + RL->addBasicBlockToLoop(RedEpiHeader, *LI); + RL->addBasicBlockToLoop(body, *LI); + } else { + LI->changeLoopFor(RedEpiHeader, RL); + RL->addBlockEntry(RedEpiHeader); + LI->changeLoopFor(body, RL); + RL->addBlockEntry(body); + } + } + + LLVM_DEBUG(dbgs() << "Function after reduction epilogue\n" << *F); + + // TODO: fix DT updates + //DT->recalculate(*F); + +#ifndef NDEBUG + //DT->verify(); + //LI->verify(*DT); +#endif return NewLoop; } diff --git 
a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp index 9866dcd1fb9ff..56c81b0396bbb 100644 --- a/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp +++ b/llvm/lib/Transforms/Tapir/LoopStripMinePass.cpp @@ -130,8 +130,8 @@ static bool tryToStripMineLoop( "form.\n"); return false; } - bool StripMiningRequested = - (hasLoopStripmineTransformation(L) == TM_ForcedByUser); + //bool StripMiningRequested = + // (hasLoopStripmineTransformation(L) == TM_ForcedByUser); TargetTransformInfo::StripMiningPreferences SMP = gatherStripMiningPreferences(L, SE, TTI, ProvidedCount); @@ -152,12 +152,12 @@ static bool tryToStripMineLoop( // If the loop size is unknown, then we cannot compute a stripmining count for // it. - if (!ExplicitCount && UnknownSize) { - LLVM_DEBUG(dbgs() << " Not stripmining loop with unknown size.\n"); - ORE.emit(createMissedAnalysis("UnknownSize", L) - << "Cannot stripmine loop with unknown size."); - return false; - } + //if (!ExplicitCount && UnknownSize) { + // LLVM_DEBUG(dbgs() << " Not stripmining loop with unknown size.\n"); + // ORE.emit(createMissedAnalysis("UnknownSize", L) + // << "Cannot stripmine loop with unknown size."); + // return false; + // } // If the loop size is enormous, then we might want to use a stripmining count // of 1 for it. @@ -212,12 +212,14 @@ static bool tryToStripMineLoop( // If the loop contains potentially expensive function calls, then we don't // want to stripmine it. + /* if (NumCalls > 0 && !ExplicitCount && !StripMiningRequested) { LLVM_DEBUG(dbgs() << " Skipping loop with expensive function calls.\n"); ORE.emit(createMissedAnalysis("ExpensiveCalls", L) << "Not stripmining loop with potentially expensive calls."); return false; } + */ // Make sure the count is a power of 2. if (!isPowerOf2_32(SMP.Count)) @@ -282,12 +284,30 @@ static bool tryToStripMineLoop( // Save loop properties before it is transformed. 
   MDNode *OrigLoopID = L->getLoopID();
 
+  // TODO: change this to check tapir loop attributes for custom target
+  bool GPU = false;
+
+  TTID target = TGI.hasTTID() ? TGI.getTTID() : TTID::OpenCilk;
+
+  switch(target){
+  // We don't want to stripmine for serial targets
+  case TTID::Serial:
+  case TTID::Cuda:
+  case TTID::Hip:
+    return false;
+  case TTID::GPU:
+    GPU = true;
+    break;
+  default:
+    break;
+  }
+
   // Stripmine the loop
   Loop *RemainderLoop = nullptr;
   Loop *NewLoop = StripMineLoop(L, SMP.Count, SMP.AllowExpensiveTripCount,
                                 SMP.UnrollRemainder, LI, &SE, &DT, TTI, &AC,
                                 TI, &ORE, PreserveLCSSA, ParallelEpilog,
-                                NeedNestedSync, &RemainderLoop);
+                                NeedNestedSync, &RemainderLoop, GPU);
   if (!NewLoop)
     return false;
 
@@ -307,13 +327,10 @@
   return true;
 }
 
-PreservedAnalyses LoopStripMinePass::run(Function &F,
-                                         FunctionAnalysisManager &AM) {
-  Module& M = *F.getParent();
-
-  auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
-  const TapirTargetInfo &TGI = *MAM.getCachedResult<TapirTargetAnalysis>(M);
+static bool loopStripMineImpl(Function &F,
+                              FunctionAnalysisManager &AM,
+                              TapirTargetInfo &TGI) {
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   auto &LI = AM.getResult<LoopAnalysis>(F);
@@ -372,7 +389,7 @@ PreservedAnalyses LoopStripMinePass::run(Function &F,
   // The parent must not be damaged by stripmining!
 #ifndef NDEBUG
   if (LoopChanged && ParentL)
-    ParentL->verifyLoop();
+    //ParentL->verifyLoop();
 #endif
 
   // Clear any cached analysis results for L if we removed it completely.
@@ -380,8 +397,29 @@ PreservedAnalyses LoopStripMinePass::run(Function &F,
       LAM->clear(L, LoopName);
   }
 
+  return Changed;
+}
+
+PreservedAnalyses LoopStripMinePass::run(Module &M,
+                                         ModuleAnalysisManager &AM){
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto &TGI = AM.getResult<TapirTargetAnalysis>(M);
+
+  bool Changed = false;
+
+  for(auto &F: M){
+    if(!F.empty())
+      Changed |= loopStripMineImpl(F, FAM, TGI);
+  }
+
   if (!Changed)
     return PreservedAnalyses::all();
-  return getLoopPassPreservedAnalyses();
+  PreservedAnalyses PA = PreservedAnalyses::none();
+  // If we've changed, assume we've not preserved anything
+  PA.preserve<TapirTargetAnalysis>();
+  return PA;
+  //return PreservedAnalyses::none();
+
 }
+
diff --git a/llvm/lib/Transforms/Tapir/LoweringUtils.cpp b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
index e90970df06f0f..fd17fbce9d882 100644
--- a/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
+++ b/llvm/lib/Transforms/Tapir/LoweringUtils.cpp
@@ -25,6 +25,10 @@
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/Tapir/Outline.h"
+#include "llvm/Transforms/Tapir/QthreadsABI.h"
+#include "llvm/Transforms/Tapir/RealmABI.h"
+#include "llvm/Transforms/Tapir/GPUABI.h"
+#include "llvm/Transforms/Tapir/SerialABI.h"
 #include "llvm/Transforms/Tapir/TapirLoopInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
diff --git a/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll b/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll
index 990c577e2e781..a338ee84a92b9 100644
--- a/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll
+++ b/llvm/test/Transforms/Tapir/jump-threading-detach-continue-2.ll
@@ -1,7 +1,7 @@
 ; Check that jump threading does not thread a detach-continue edge if
 ; it does not also thread the corresponding reattach-continue edge.
; -; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars,loop-unroll-full),gvn<>,instcombine,loop-mssa(licm)))),function(loop-stripmine,jump-threading)" -unroll-peel-max-count=0 -require-parallel-epilog -S | FileCheck %s +; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars,loop-unroll-full),gvn<>,instcombine,loop-mssa(licm)))),loop-stripmine,function(jump-threading)" -unroll-peel-max-count=0 -require-parallel-epilog -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll b/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll index 338af8c741b74..b1cd73cfed784 100644 --- a/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll +++ b/llvm/test/Transforms/Tapir/loop-stripmine-zero-iter-pfor-loop.ll @@ -1,6 +1,6 @@ ; Check that loop-stripmining generates the correct IR to enable a zero-iteration parallel loop to be optimized away. ; -; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars),sroa))),function(loop-stripmine,early-cse,instcombine),function(simplifycfg)" -S | FileCheck %s +; RUN: opt < %s -passes="cgscc(devirt<4>(inline,function(loop(indvars),sroa))),loop-stripmine,function(early-cse,instcombine),function(simplifycfg)" -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll b/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll index fecf057069ec4..b217ffb6059fb 100644 --- a/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll +++ b/llvm/test/Transforms/Tapir/missed-loop-opts-test.ll @@ -1,6 +1,6 @@ ; Check that Tapir loops can be peeled and subsequently stripmined. 
; -; RUN: opt < %s -passes='loop(loop-unroll-full),gvn,loop(tapir-indvars),loop-stripmine' -S | FileCheck %s +; RUN: opt < %s -passes='function(loop(loop-unroll-full),gvn,loop(tapir-indvars)),loop-stripmine' -S | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu"