Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1c4744b
[kitsune] Add print-before-first command line option
tarunprabhu Oct 20, 2025
8c040e3
[kitsune] Simplify GPU-centric tapir targets
tarunprabhu Oct 20, 2025
a8ee49d
Basic loop stripmine-based implicit parallel reduction (sum only) cod…
stelleg Dec 10, 2021
bacfd20
Explicit parallel reductions via unital magmas added
stelleg Dec 10, 2021
42d542d
Added reductions documentation
stelleg Dec 10, 2021
d51fb68
Implicit sum reductions functional
stelleg Jun 28, 2022
bf2a469
Outlined implementation approach for updated reduction implementation
stelleg Feb 1, 2023
c8bdc81
Working reductions, sequential semantics preserved for commutative as…
stelleg Feb 21, 2023
de32b10
updated reduction example with timing
stelleg Jun 13, 2023
df151f9
Fixed reductions use of pointer types
stelleg Sep 21, 2023
7714266
working reductions on 16.x
stelleg Sep 12, 2024
ca900c8
added reductions example makefile
stelleg Sep 16, 2024
430aa38
Added openmp reduce for comparison
stelleg Oct 31, 2024
36537a1
GPU reductions via stripmining pass working
stelleg May 6, 2025
36be3a5
gpuGridSize takes iteration count
stelleg May 22, 2025
957be4c
Fix to ensure clang generates loop invariant bound for RHS of comparison
stelleg Jul 18, 2025
3f4d462
CUDA 13 fixes
stelleg Oct 2, 2025
9fe76e2
Added reduction keyword header
stelleg Dec 18, 2025
3ff073e
Working simple reductions
stelleg Dec 18, 2025
5e7420f
GPU ABI fixes
stelleg Dec 18, 2025
d5db34c
Make stripmining not depenedent on O2 or O3
stelleg Dec 18, 2025
316b838
Make dead argument elimination disabled by optnone
stelleg Dec 18, 2025
7113766
21.x rebase fixes
stelleg Jan 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -5229,6 +5229,14 @@ def NonString : InheritableAttr {
let Documentation = [NonStringDocs];
}

// +===== kitsune-/tapir-centric attributes

// Marks a function as a kitsune reduction operator.
//
// The attribute is declared with the Clang<> spelling under the name
// `kitsune_reduction` and may only be applied to function-like declarations.
// Sema accepts it as a simple attribute with no arguments, and CodeGen
// translates it into the LLVM function attribute
// llvm::Attribute::KitsuneReduction when building a call's attribute list.
//
// NOTE(review): Documentation points at TapirRTDocs; no reduction-specific
// documentation entry is visible here -- confirm this is intentional.
def KitsuneReduction : InheritableAttr {
let Spellings = [Clang<"kitsune_reduction">];
let Subjects = SubjectList<[FunctionLike]>;
let Documentation = [TapirRTDocs];
}

def TapirTarget : StmtAttr {
let Spellings = [CXX11<"tapir","target">];

Expand All @@ -5247,9 +5255,9 @@ def TapirTarget : StmtAttr {
ErrorDiag, "'parallel' statements">;
let Args = [
EnumArgument<"TapirTargetAttrType", "TapirTargetAttrTy", /*is_string=*/ true,
["nolo", "serial", "cuda", "hip", "opencilk", "openmp",
["nolo", "serial", "cuda", "hip", "gpu", "opencilk", "openmp",
"qthreads", "realm"],
["Nolo", "Serial", "Cuda", "Hip", "OpenCilk", "OpenMP",
["Nolo", "Serial", "Cuda", "Hip", "GPU", "OpenCilk", "OpenMP",
"Qthreads", "Realm"],
0>
];
Expand Down
5 changes: 3 additions & 2 deletions clang/include/clang/Basic/DiagnosticDriverKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,9 @@ def err_drv_unsupported_option_argument_for_frontend : Error<
def err_drv_kitsune_missing_required : Error<
"missing required option '--%0'">;

def err_drv_kitsune_tapir_required : Error<
"--tapir is required with '%0'">;

def err_drv_kitsune_optzns_required : Error<
"--tapir requires optimization level O1 or higher">;
def err_drv_kitsune_lto_o2_required : Error<
Expand Down Expand Up @@ -944,8 +947,6 @@ def err_drv_opencilk_missing_abi_bitcode: Error<

def err_drv_kitsune_kokkos_disabled : Error<
"kokkos support was not enabled when kitsune was built">;
def err_drv_kitsune_kokkos_no_tapir : Error<
"--tapir is required with '%0'">;

def err_drv_kitsune_target_not_enabled: Error<
"tapir target '%0' was not enabled when kitsune was built">;
Expand Down
15 changes: 12 additions & 3 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -4283,7 +4283,7 @@ def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group<f_Group>,
HelpText<"Do not process trigraph sequences">,
Visibility<[ClangOption, CC1Option]>;
def funique_source_file_names: Flag<["-"], "funique-source-file-names">, Group<f_Group>,
HelpText<"Allow the compiler to assume that each translation unit has a unique "
HelpText<"Allow the compiler to assume that each translation unit has a unique "
"source file identifier (see -funique-source-file-identifier) at link time">;
def fno_unique_source_file_names: Flag<["-"], "fno-unique-source-file-names">;
def unique_source_file_identifier_EQ: Joined<["-"], "funique-source-file-identifier=">, Group<f_Group>,
Expand Down Expand Up @@ -7084,7 +7084,7 @@ defm android_pad_segment : BooleanFFlag<"android-pad-segment">, Group<f_Group>;
def shared_libflangrt : Flag<["-"], "shared-libflangrt">,
HelpText<"Link the flang-rt shared library">, Group<Link_Group>,
Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>;
def static_libflangrt : Flag<["-"], "static-libflangrt">,
def static_libflangrt : Flag<["-"], "static-libflangrt">,
HelpText<"Link the flang-rt static library">, Group<Link_Group>,
Visibility<[FlangOption]>, Flags<[NoArgumentUnused]>;

Expand Down Expand Up @@ -9508,14 +9508,23 @@ def fkokkos_no_init : Flag<["-"], "fkokkos-no-init">, Alias<kokkos_no_init>,
Visibility<[ClangOption, CC1Option]>,
HelpText<"DEPRECATED: Use --kokkos-no-init">;

// Tapir currently requires at least -O1. This makes it difficult to write tests
// that check that the frontend generates the expected LLVM IR. If this
// restriction is ever removed, this option will no longer be necessary since
// we could just pass -O0 to examine the IR generated by the frontend.
// The option is visible to both the clang and flang drivers and to their
// respective frontends (cc1 / fc1).
//
// NOTE(review): the option is declared with the Joined kind (as is
// kitrt_verbose), but the visible consumers only test for its presence with
// hasArg() and regenerate it without a value. Confirm whether Flag is the
// intended kind.
def print_before_first: Joined<["--"], "print-before-first">,
Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
HelpText<"Print the LLVM Module to stderr before running the optimization "
"pipeline">;

def kitrt_verbose: Joined<["--"], "kitrt-verbose">,
Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
HelpText<"Enable verbose mode in kitsune's runtime">;

def tapir_EQ : Joined<["--"], "tapir=">,
Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
MetaVarName<"<target>">,
Values<"none,serial,cuda,hip,opencilk">,
Values<"nolo,serial,cuda,hip,opencilk">,
HelpText<"The primary tapir target">;

def ftapir_EQ : Joined<["-"], "ftapir=">, Alias<tapir_EQ>,
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1216,6 +1216,10 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
return;
}

const KitsuneOptions& kitOpts = CI.getKitsuneOpts();
if (kitOpts.hasTTID() && kitOpts.getPrintBeforeFirst())
llvm::errs() << *TheModule << "\n";

// Now that we have all of the passes ready, run them.
{
PrettyStackTraceString CrashInfo("Optimizer");
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/CGCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2512,6 +2512,9 @@ void CodeGenModule::ConstructAttributeList(StringRef Name,
FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly());
FuncAttrs.addAttribute(llvm::Attribute::NoUnwind);
}
if (TargetDecl->hasAttr<KitsuneReductionAttr>()) {
FuncAttrs.addAttribute(llvm::Attribute::KitsuneReduction);
}
if (const auto *RA = TargetDecl->getAttr<RestrictAttr>();
RA && RA->getDeallocator() == nullptr)
RetAttrs.addAttribute(llvm::Attribute::NoAlias);
Expand Down
63 changes: 62 additions & 1 deletion clang/lib/CodeGen/CGKitsune.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
#include "clang/CodeGen/CGFunctionInfo.h"
#include "clang/Frontend/FrontendDiagnostic.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/IR/FixedPointBuilder.h"

using namespace clang;
using namespace CodeGen;
Expand Down Expand Up @@ -106,6 +107,8 @@ CodeGenFunction::GetTapirTargetAttr(ArrayRef<const Attr *> Attrs) {
return llvm::TTID::Cuda;
case TapirTargetAttr::Hip:
return llvm::TTID::Hip;
case TapirTargetAttr::GPU:
return llvm::TTID::GPU;
case TapirTargetAttr::OpenCilk:
return llvm::TTID::OpenCilk;
case TapirTargetAttr::OpenMP:
Expand Down Expand Up @@ -326,6 +329,12 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S,
// Evaluate the initialization before the loop.
EmitStmt(S.getInit());

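// The loop condition is assumed to be a binary comparison `LHS <op> RHS`.
// The RHS (the loop bound) is evaluated once here, before the condition
// block is emitted, so that the comparison in the condition block uses a
// loop-invariant bound; only the LHS is re-evaluated on each iteration.
// TODO: dyn_cast yields null when the condition is not a BinaryOperator, and
// BO is dereferenced below without a check.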
auto* BO = dyn_cast<BinaryOperator>(S.getCond());
llvm::Value *RHS = EmitScalarExpr(BO->getRHS());


// In a parallel loop there will always be a condition block
// so there is no need to test
JumpDest Condition = getJumpDestInCurrentScope("forall.cond");
Expand Down Expand Up @@ -364,7 +373,59 @@ void CodeGenFunction::EmitForallStmt(const ForallStmt &S,

// C99 6.8.5p2/p4: The first substatement is executed if the expression
// compares unequal to 0. The condition must be a scalar type.
llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond());

//llvm::Value *BoolCondVal = EvaluateExprAsBool(S.getCond());
llvm::Value *LHS = EmitScalarExpr(BO->getLHS());
QualType LHSTy = BO->getLHS()->getType();
llvm::Value *BoolCondVal;
if (LHSTy->hasSignedIntegerRepresentation()) {
switch(BO->getOpcode()) {
case BO_GT:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGT, LHS, RHS, "cmp");
break;
case BO_GE:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SGE, LHS, RHS, "cmp");
break;
case BO_LT:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, LHS, RHS, "cmp");
break;
case BO_LE:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_SLE, LHS, RHS, "cmp");
break;
case BO_NE:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp");
break;
case BO_EQ:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp");
break;
default:
llvm_unreachable("Invalid comparison in forall");
}
} else {
switch(BO->getOpcode()){
case BO_GT:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGT, LHS, RHS, "cmp");
break;
case BO_GE:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_UGE, LHS, RHS, "cmp");
break;
case BO_LT:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_ULT, LHS, RHS, "cmp");
break;
case BO_LE:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_ULE, LHS, RHS, "cmp");
break;
case BO_NE:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_NE, LHS, RHS, "cmp");
break;
case BO_EQ:
BoolCondVal = Builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, LHS, RHS, "cmp");
break;
default:
llvm_unreachable("Invalid comparison in forall");
}
}

Builder.CreateCondBr(
BoolCondVal, Detach, Sync.getBlock(),
createProfileWeightsForLoop(S.getCond(), getProfileCount(S.getBody())));
Expand Down
9 changes: 8 additions & 1 deletion clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ static void CheckTTEnabled(const Driver &D, llvm::TTID TT) {
if (!KITSUNE_REALM_ENABLED)
D.Diag(diag::err_drv_kitsune_target_not_enabled) << llvm::toString(TT);
return;
case llvm::TTID::GPU:
// TODO: Check this
return;
case llvm::TTID::Serial:
// The serial tapir target is always enabled
return;
Expand Down Expand Up @@ -229,13 +232,17 @@ static void CheckKitsuneOptions(const Driver &D, const ArgList &Args,

// If --kokkos is provided, then a tapir target must also be provided.
if (!Args.hasArg(options::OPT_tapir_EQ)) {
D.Diag(diag::err_drv_kitsune_kokkos_no_tapir)
D.Diag(diag::err_drv_kitsune_tapir_required)
<< Args.getLastArg(options::OPT_kokkos, options::OPT_kokkos_no_init)
->getSpelling();
return;
}
}

if (const Arg *A = Args.getLastArg(options::OPT_print_before_first))
if (!Args.hasArg(options::OPT_tapir_EQ))
D.Diag(diag::err_drv_kitsune_tapir_required) << A->getSpelling();

// Check that the -ftapir flag has a valid value. This stops us from
// reporting multiple errors because the flag is examined in several places.
if (const Arg *A = Args.getLastArg(options::OPT_tapir_EQ)) {
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/Driver/KitsuneOptionUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ clang::getTapirTargetConfigFileName(const opt::ArgList &args) {
return "qthreads.cfg";
case TTID::Realm:
return "realm.cfg";
case TTID::GPU:
return "gpu.cfg";
}
llvm_unreachable("getTapirTargetConfigFile: TTID not handled");
}
Expand Down Expand Up @@ -323,6 +325,7 @@ static bool parseKitsuneTTArgs(KitsuneOptions &kitOpts, TTID tt,
case llvm::TTID::Realm:
return parseKitsuneRealmArgs(kitOpts, args, optTable, diags);
case llvm::TTID::Serial:
case llvm::TTID::GPU: // TODO:?
return true;
}
llvm_unreachable("ParseKitsuneTTArgs: TTID not handled");
Expand All @@ -335,6 +338,7 @@ bool clang::parseKitsuneArgs(KitsuneOptions &kitOpts, const char *argv0,

kitOpts.setKitsuneFrontend(IsKitsuneFrontend(argv0));
kitOpts.setStripmineLoops(args.hasArg(OPT_fstripmine));
kitOpts.setPrintBeforeFirst(args.hasArg(OPT_print_before_first));

kitOpts.setTapirVerbose(args.hasArg(OPT_tapir_verbose));
kitOpts.setKitrtVerbose(args.hasArg(OPT_kitrt_verbose));
Expand Down
7 changes: 7 additions & 0 deletions clang/lib/Driver/ToolChain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2277,6 +2277,8 @@ void ToolChain::AddKitsunePreprocessorArgs(const ArgList &Args,
switch (TT) {
case TTID::Nolo:
return;
case TTID::GPU:
return;
case TTID::Cuda:
return ExtractArgsFromString(KITSUNE_CUDA_EXTRA_PREPROCESSOR_FLAGS,
CmdArgs, Args);
Expand Down Expand Up @@ -2356,6 +2358,8 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args,
ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_COMPILER_FLAGS, CmdArgs,
Args);
return;
case llvm::TTID::GPU:
return;
case TTID::OpenMP:
AddKitsuneOpenMPCommonArgs(Args, CmdArgs);
ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_COMPILER_FLAGS, CmdArgs, Args);
Expand Down Expand Up @@ -2385,6 +2389,7 @@ void ToolChain::AddKitsuneCompilerArgs(const ArgList &Args,

if (std::optional<TTID> TT = parseTapirTargetIfValid(Args)) {
Args.AddLastArg(CmdArgs, options::OPT_ffp_contract);
Args.AddLastArg(CmdArgs, options::OPT_print_before_first);
Args.AddLastArg(CmdArgs, options::OPT_kitrt_verbose);
Args.AddLastArg(CmdArgs, options::OPT_tapir_verbose);
Args.AddLastArg(CmdArgs, options::OPT_tapir_EQ);
Expand Down Expand Up @@ -2563,6 +2568,8 @@ void ToolChain::AddKitsuneLinkerArgs(const ArgList &Args,
AddKitsuneOpenCilkLinkerArgs(Args, CmdArgs);
ExtractArgsFromString(KITSUNE_OPENCILK_EXTRA_LINKER_FLAGS, CmdArgs, Args);
return;
case TTID::GPU:
return;
case TTID::OpenMP:
AddKitsuneOpenMPLinkerArgs(Args, CmdArgs);
ExtractArgsFromString(KITSUNE_OPENMP_EXTRA_LINKER_FLAGS, CmdArgs, Args);
Expand Down
4 changes: 4 additions & 0 deletions clang/lib/Frontend/CompilerInvocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4813,6 +4813,7 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts,
case llvm::TTID::Qthreads:
case llvm::TTID::Realm:
case llvm::TTID::Serial:
case llvm::TTID::GPU:
return;
}
llvm_unreachable("GenerateKitsuneArgs: TTID not handled");
Expand Down Expand Up @@ -4849,6 +4850,9 @@ void CompilerInvocationBase::GenerateKitsuneArgs(const KitsuneOptions &Opts,

if (Opts.getStripmineLoops())
GenerateArg(Consumer, OPT_fstripmine);

if (Opts.getPrintBeforeFirst())
GenerateArg(Consumer, OPT_print_before_first);
}

bool CompilerInvocation::CheckKitsuneArgs(const ArgList &Args,
Expand Down
4 changes: 3 additions & 1 deletion clang/lib/Sema/SemaDeclAttr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7902,10 +7902,12 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_TypeNullable:
handleNullableTypeAttr(S, D, AL);
break;

case ParsedAttr::AT_VTablePointerAuthentication:
handleVTablePointerAuthentication(S, D, AL);
break;
case ParsedAttr::AT_KitsuneReduction:
handleSimpleAttribute<KitsuneReductionAttr>(S, D, AL);
break;
}
}

Expand Down
3 changes: 3 additions & 0 deletions flang/lib/Frontend/FrontendActions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1053,6 +1053,9 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) {
return;
}

if (kitsuneOpts.hasTTID() && kitsuneOpts.getPrintBeforeFirst())
llvm::errs() << *llvmModule << "\n";

// Run the passes.
mpm.run(*llvmModule, mam);

Expand Down
49 changes: 49 additions & 0 deletions kitsune-tests/reductions/l2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include<time.h>
#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<kitsune.h>
#include<gpu.h>
#include<stdint.h>
#include<assert.h>
#include<omp.h>

// Reduction operator for addition: accumulates `b` into `*a`.
//
//   a    - accumulator, updated in place
//   b    - value to add to the accumulator
//   unit - not read by the body; the caller in this file passes 0.0
//          (presumably the identity element of the operation, supplied for
//          the compiler's reduction support -- TODO confirm)
//
// NOTE(review): `reduction` is not defined in this file; it presumably comes
// from one of the included kitsune headers -- confirm.
reduction
void sum(double *a, double b, double unit){
*a += b;
}

// Computes the Euclidean (L2) norm of the first `n` elements of `a`:
// the square root of the sum of a[i] * a[i] for i in [0, n).
//
//   n - number of elements to read from `a`
//   a - input array; must hold at least `n` doubles
//
// Returns sqrt of the accumulated sum (0 when n == 0).
double l2(uint64_t n, double* a){
// Single accumulator shared by every iteration of the forall loop.
double red = 0;
forall(uint64_t i=0; i<n; i++){
// Every iteration adds its squared element into `red` through the
// reduction operator; 0.0 is passed as the operator's `unit` argument.
// NOTE(review): correctness under parallel execution presumably relies on
// the compiler recognising sum() as a reduction -- confirm.
sum(&red, a[i] * a[i], 0.0);
}

return sqrt(red);
}

// Benchmark driver: fills an array of 2^e doubles and times repeated l2()
// calls over it.
//
// Usage: <program> [e] [niter]
//   e     - base-2 logarithm of the element count (default 28)
//   niter - number of timed l2() calls (default 100)
//
// Prints the last computed norm together with the elapsed time, followed by
// the achieved bandwidth in GB/s.
// Returns 0 on success, 1 on invalid arguments or allocation failure.
int main(int argc, char** argv){
  int e = argc > 1 ? atoi(argv[1]) : 28;
  int niter = argc > 2 ? atoi(argv[2]) : 100;

  // Shifting a 64-bit value by a negative count or by 64 or more is
  // undefined behaviour, so reject such exponents up front.
  if (e < 0 || e > 63) {
    fprintf(stderr, "error: exponent must be in [0, 63], got %d\n", e);
    return 1;
  }
  // A non-positive iteration count yields no timed work and a meaningless
  // bandwidth figure.
  if (niter <= 0) {
    fprintf(stderr, "error: iteration count must be positive, got %d\n", niter);
    return 1;
  }

  uint64_t n = 1ULL<<e;

  // Make sure the byte count passed to the allocator cannot wrap around.
  if (n > SIZE_MAX / sizeof(double)) {
    fprintf(stderr, "error: 2^%d doubles do not fit in the address space\n", e);
    return 1;
  }

  double* arr = (double*)gpuManagedMalloc(sizeof(double) * n);
  // NOTE(review): assumes the allocator signals failure with a null pointer
  // -- confirm against its documentation.
  if (arr == NULL) {
    fprintf(stderr, "error: failed to allocate %llu doubles\n",
            (unsigned long long)n);
    return 1;
  }

  forall(uint64_t i=0; i<n; i++){
    arr[i] = i;
  }

  // Untimed call before the measurement loop (presumably a warm-up).
  l2(n, arr);

  double par = 0;
  double before = omp_get_wtime();
  for(int i=0; i<niter; i++){
    par = l2(n, arr);
  }
  double after = omp_get_wtime();
  double partime = after - before;

  printf("%f in %f s\n" , par, partime);

  // Accumulate the byte count in floating point so that large n * niter
  // products cannot overflow an integer type.
  double bytes = (double)n * (double)niter * (double)sizeof(double);
  double bw = bytes / (1000000000.0 * partime);
  printf("bandwidth: %f GB/s \n" , bw);

  // NOTE(review): arr is not released before exit; the matching
  // deallocation routine is not visible in this file.
  return 0;
}

Loading