diff --git a/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h b/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h index d4ebe9ea3975..622d9f27554b 100644 --- a/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h +++ b/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h @@ -52,6 +52,9 @@ struct ParamEntry { struct TuningParamSet { llvm::SetVector tuningRange; KernelType primaryOpType; + // The tuning kind that was actually used (may differ from requested kind, + // e.g. Greedy falls back to Exhaustive for non-accel). + TuningParamSetKind effectiveKind; }; struct TuningParamSpaceSettings { diff --git a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp index ca0afea45388..5be7e7f66f2c 100644 --- a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp +++ b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp @@ -965,6 +965,7 @@ createTunableParamSpace(ModuleOp mod, TuningParamSetKind kind, rock::TuningParamSpaceSettings &settings) { struct TuningParamSet *newSpace; newSpace = new TuningParamSet(); + newSpace->effectiveKind = kind; // create range and heuristic WalkResult findPrimary = @@ -974,6 +975,7 @@ createTunableParamSpace(ModuleOp mod, TuningParamSetKind kind, // greedy is not implemented for non-accel if (!archInfo.isAccel(op) && kind == TuningParamSetKind::Greedy) { kind = TuningParamSetKind::Exhaustive; + newSpace->effectiveKind = kind; llvm::errs() << "Greedy tuning not implemented for non-accel, using " "Exhaustive instead\n"; } diff --git a/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h b/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h index a2833936300e..92d4f69fa4c1 100644 --- a/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h +++ b/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h @@ -1,4 +1,4 @@ -//===- ConcurrentQueue.h - Simple MPMC queue --------------------*- C++ -*-===// +//===- ConcurrentQueue.h - Rate-adaptive MPMC queue -------------*- C++ -*-===// // // Part of the rocMLIR Project, under the Apache License v2.0 with LLVM // Exceptions. See https://llvm.org/LICENSE.txt for license information. @@ -11,7 +11,9 @@ #include "llvm/Support/Compiler.h" +#include #include +#include #include #include #include @@ -21,26 +23,39 @@ namespace rocmlir::tuningdriver { template class ConcurrentQueue { public: + // If maxCapacity is 0, the queue is unbounded + explicit ConcurrentQueue(size_t maxCapacity = 0) : maxCapacity(maxCapacity) {} + template bool push(U &&item) { if (LLVM_UNLIKELY(done.load(std::memory_order_relaxed))) return false; // Early exit if terminated { - std::lock_guard lock(mtx); + std::unique_lock lock(mtx); + + if (maxCapacity > 0) { + cvNotFull.wait(lock, [this] { + return queue.size() < currentCapacity || + done.load(std::memory_order_relaxed); + }); + } + if (LLVM_UNLIKELY(done.load(std::memory_order_relaxed))) - return false; // Double-check after acquiring the lock + return false; queue.emplace(std::forward(item)); } - cv.notify_one(); + cvNotEmpty.notify_one(); return true; } bool pop(T &item) { std::unique_lock lock(mtx); - cv.wait(lock, [this] { + + bool starved = queue.empty(); + cvNotEmpty.wait(lock, [this] { return !queue.empty() || done.load(std::memory_order_relaxed); }); @@ -49,20 +64,51 @@ class ConcurrentQueue { item = std::move(queue.front()); queue.pop(); + + if (maxCapacity > 0) { + if (starved) { + // If the queue was empty, increase the capacity + currentCapacity = std::min(currentCapacity + 1, maxCapacity); + consecutiveFed = 0; + } else { + ++consecutiveFed; + if (consecutiveFed >= fedShrinkThreshold) { + // Decrease the capacity if the queue has been fed for a while + currentCapacity = + std::max(currentCapacity / 2, static_cast(1)); + consecutiveFed = 0; + } + } + } + + lock.unlock(); + + cvNotFull.notify_one(); return true; } void terminate() { - done.store(true, std::memory_order_relaxed); - cv.notify_all(); + { + std::lock_guard lock(mtx); + done.store(true, std::memory_order_relaxed); + } + cvNotEmpty.notify_all(); + cvNotFull.notify_all(); } bool isTerminated() const { return done.load(std::memory_order_relaxed); } private: + static constexpr size_t fedShrinkThreshold = 4; + + const size_t maxCapacity; + size_t currentCapacity{maxCapacity}; + size_t consecutiveFed{0}; + std::queue queue; std::mutex mtx; - std::condition_variable cv; + std::condition_variable cvNotEmpty; + std::condition_variable cvNotFull; std::atomic done{false}; }; diff --git a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp index 23f8506f837b..1048ab72a8e3 100644 --- a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp +++ b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp @@ -56,9 +56,6 @@ #include #include -// Utilities to allocate buffers -#include "../utils/performance/common/benchmarkUtils.h" - #include "CacheFlush.h" #include "ConcurrentQueue.h" @@ -183,30 +180,6 @@ static OwningOpRef parseMLIRInput(StringRef inputFilename, return parseSourceFile(sourceMgr, context); } -static benchmark::DataType getDataType(Type inputType) { - if (inputType.isF32()) { - return benchmark::DataType::F32; - } else if (inputType.isInteger(32)) { - return benchmark::DataType::I32; - } else if (inputType.isF16()) { - return benchmark::DataType::F16; - } else if (inputType.isBF16()) { - return benchmark::DataType::BF16; - } else if (inputType.isInteger(8)) { - return benchmark::DataType::I8; - } else if (isa(inputType)) { - return benchmark::DataType::F8; - } else if (isa(inputType)) { - return benchmark::DataType::F8E8M0FNU; - } else if (isa(inputType)) { - return benchmark::DataType::F4; - } else { - llvm::errs() << "Unknown data type: " << inputType << "\n"; - llvm_unreachable("Kernels only accept ints or floats"); - } -} - // intentionally leaky macro #define HIPCHECK(expr) \ if (hipSuccess != (expr)) { \ @@ -431,28 +404,14 @@ measureLargeKernel(unsigned iterations, hipStream_t stream, } // In order to match rocprof, returns time in nanoseconds -static FailureOr -benchmarkKernels(ArrayRef binaries, - ArrayRef funcNames, ArrayRef blockSizes, - ArrayRef gridSizes, ArrayRef hostBuffers, - MutableArrayRef gpuBuffers, - ArrayRef bufferSizes, const BenchmarkParams ¶ms) { +static FailureOr benchmarkKernels(ArrayRef binaries, + ArrayRef funcNames, + ArrayRef blockSizes, + ArrayRef gridSizes, + MutableArrayRef gpuBuffers, + hipStream_t stream, + const BenchmarkParams ¶ms) { bool benchmarkMode = !params.benchmarkConfig.empty(); - hipStream_t stream; - HIPCHECK(hipStreamCreate(&stream)); - auto streamCleanup = llvm::make_scope_exit([&]() { - hipError_t destroyStatus = hipStreamDestroy(stream); - if (destroyStatus != hipSuccess) { - llvm::errs() << "HIP error in hipStreamDestroy: " - << hipGetErrorString(destroyStatus) << "\n"; - } - }); - - // Initialize device buffers - for (size_t i = 0; i < bufferSizes.size(); i++) { - HIPCHECK(hipMemcpyAsync(gpuBuffers[i], hostBuffers[i], bufferSizes[i], - hipMemcpyHostToDevice, stream)); - } // HIP wants an array of pointers to each argument std::vector argPointers; @@ -702,12 +661,19 @@ static LogicalResult runTuningLoop(ModuleOp source) { backendOpts.optLevel = 3; backendOpts.suppressDiagnostic = true; - // 3. Initialize host buffers and allocate device buffers - std::vector hostBuffers; + // 3. Create HIP stream and allocate device buffers + hipStream_t stream; + HIPCHECK(hipStreamCreate(&stream)); + auto streamCleanup = llvm::make_scope_exit([&]() { + hipError_t status = hipStreamDestroy(stream); + if (status != hipSuccess) { + llvm::errs() << "HIP error in hipStreamDestroy: " + << hipGetErrorString(status) << "\n"; + } + }); + std::vector gpuBuffers; auto bufferCleanup = llvm::make_scope_exit([&]() { - for (void *buffer : hostBuffers) - free(buffer); for (void *buffer : gpuBuffers) { // hipFree does not allow nullptrs, so make sure to check for it first if (!buffer) @@ -722,21 +688,17 @@ static LogicalResult runTuningLoop(ModuleOp source) { llvm::errs() << "Failed to cleanup cache flush artifacts\n"; } }); - assert(argTypes.size() == bufferLengths.size() && - "number of arguments and buffer lengths must match"); - for (auto [argType, bufferLength] : llvm::zip(argTypes, bufferLengths)) { - benchmark::DataType type = getDataType(getElementTypeOrSelf(argType)); - void *hostBuffer = benchmark::allocAndFill(type, bufferLength); + for (size_t bufferLength : bufferLengths) { void *gpuBuffer = nullptr; hipError_t hipStatus = hipMalloc(&gpuBuffer, bufferLength); if (hipStatus != hipSuccess) { - free(hostBuffer); - llvm::errs() << "HIP error in hipMalloc(gpuBuffer): " - << hipGetErrorString(hipStatus) << "\n"; + llvm::errs() << "HIP error in hipMalloc: " << hipGetErrorString(hipStatus) + << "\n"; return failure(); } - hostBuffers.push_back(hostBuffer); gpuBuffers.push_back(gpuBuffer); + // 0x3F decodes as a finite, non-zero value for all floating point types + HIPCHECK(hipMemsetAsync(gpuBuffer, 0x3F, bufferLength, stream)); } // 4. Multi-iteration tuning loop @@ -750,13 +712,31 @@ static LogicalResult runTuningLoop(ModuleOp source) { sleepUs, showStats, showAllMeasurements, tuningSpaceKind, numCompileThreads, benchmarkConfig, waitForCompiles}; - unsigned numTuningIterations = - rock::getNumberOfIterations(benchmarkParams.tuningSpaceKind); + rock::TuningParamSetKind effectiveKind = benchmarkParams.tuningSpaceKind; + unsigned numTuningIterations = rock::getNumberOfIterations(effectiveKind); if (!benchmarkParams.benchmarkConfig.empty() && numTuningIterations != 1) { llvm::errs() << "benchmarking should do a single tuning iteration\n"; return failure(); } + // Serialize source module once (shared by all threads for parsing). + // The source module doesn't change between greedy iterations. + std::string sourceModuleStr; + { + llvm::raw_string_ostream sourceOs(sourceModuleStr); + source->print(sourceOs); + } + + unsigned maxThreads = (benchmarkParams.numCompileThreads > 0) + ? benchmarkParams.numCompileThreads + : std::thread::hardware_concurrency(); + if (maxThreads == 0) + maxThreads = 4; + + // Thread resources are cached and reused across greedy iterations to avoid + // re-creating MLIRContexts, PassManagers, and re-parsing the source module. + std::vector threadResources; + // Main iteration loop - wraps config generation, compilation, AND // benchmarking for (unsigned iterIdx = 0; iterIdx < numTuningIterations; ++iterIdx) { @@ -779,6 +759,11 @@ static LogicalResult runTuningLoop(ModuleOp source) { return failure(); } + // The tuning space may have fallen back to a different kind (e.g. Greedy + // -> Exhaustive for non-accel), so adjust the iteration count. + effectiveKind = tuningSpace->effectiveKind; + numTuningIterations = rock::getNumberOfIterations(effectiveKind); + for (rock::RockTuningParamAttrInterface tuningAttr : tuningSpace->tuningRange) { SmallString<64> perfConfig; @@ -787,55 +772,47 @@ static LogicalResult runTuningLoop(ModuleOp source) { } } - // Determine number of parallel threads - unsigned numThreads = (benchmarkParams.numCompileThreads > 0) - ? benchmarkParams.numCompileThreads - : std::thread::hardware_concurrency(); - if (numThreads == 0) - numThreads = 4; // fallback - - // Don't create more threads than configs to compile - numThreads = std::min(numThreads, static_cast(configs.size())); - - // Serialize source module once (shared by all threads for parsing) - std::string sourceModuleStr; - { - llvm::raw_string_ostream sourceOs(sourceModuleStr); - source->print(sourceOs); - } + unsigned numThreads = + std::min(maxThreads, static_cast(configs.size())); // PHASE 2: Pre-initialize thread resources (contexts, PassManagers, parsed // modules). This avoids the expensive per-config overhead of creating // contexts, parsing modules, and building pipelines. + // On the first iteration all resources are created; subsequent iterations + // only grow the pool if more threads are needed. // Note: MLIR's MLIRContext cannot be safely shared across parallel pass // executions, so each thread needs its own context. - std::vector threadResources(numThreads); - std::atomic initFailed{false}; - - { - std::vector initThreads; - initThreads.reserve(numThreads); - for (unsigned i = 0; i < numThreads; ++i) { - initThreads.emplace_back([&, i]() { - if (!threadResources[i].initialize(sourceModuleStr, applicabilityOpts, - compilationKernOpts, - backendOpts)) { - initFailed.store(true, std::memory_order_relaxed); - } - }); - } - for (auto &t : initThreads) { - t.join(); + if (static_cast(numThreads) > threadResources.size()) { + unsigned oldSize = threadResources.size(); + threadResources.resize(numThreads); + std::atomic initFailed{false}; + { + std::vector initThreads; + initThreads.reserve(numThreads - oldSize); + for (unsigned i = oldSize; i < numThreads; ++i) { + initThreads.emplace_back([&, i]() { + if (!threadResources[i].initialize( + sourceModuleStr, applicabilityOpts, compilationKernOpts, + backendOpts)) { + initFailed.store(true, std::memory_order_relaxed); + } + }); + } + for (auto &t : initThreads) { + t.join(); + } } - } - if (initFailed.load(std::memory_order_relaxed)) { - llvm::errs() << "Failed to initialize thread resources\n"; - return failure(); + if (initFailed.load(std::memory_order_relaxed)) { + llvm::errs() << "Failed to initialize thread resources\n"; + return failure(); + } } // PHASE 3: Parallel compilation phase using pre-initialized resources - ConcurrentQueue compilationResults; + // Unbounded queue if we wait for all compilations to avoid blocking + size_t maxCapacity = benchmarkParams.waitForCompiles ? 0 : numThreads; + ConcurrentQueue compilationResults(maxCapacity); std::mutex outputMutex; // For thread-safe console output // Compile a single config using pre-initialized thread resources @@ -990,10 +967,9 @@ static LogicalResult runTuningLoop(ModuleOp source) { assert(result.status == CompilationStatus::Success && "Unexpected compilation status in benchmarking phase"); - FailureOr timing = - benchmarkKernels(result.hipModules, kernelFuncNames, - result.blockSizes, result.gridSizes, hostBuffers, - gpuBuffers, bufferLengths, benchmarkParams); + FailureOr timing = benchmarkKernels( + result.hipModules, kernelFuncNames, result.blockSizes, + result.gridSizes, gpuBuffers, stream, benchmarkParams); if (failed(timing)) { llvm::errs() << "Kernel execution failed\n"; @@ -1003,7 +979,7 @@ static LogicalResult runTuningLoop(ModuleOp source) { validResults++; // Find best config - if (rock::needToUpdateBest(benchmarkParams.tuningSpaceKind)) { + if (rock::needToUpdateBest(effectiveKind)) { if (timing.value() < bestTimeOverall) { bestTimeOverall = timing.value(); bestConfigOverall = result.perfConfig;