From 573295405025232130b11b9e0ec842795c903931 Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Tue, 24 Feb 2026 20:15:53 +0000 Subject: [PATCH 1/3] Add single barrier for single wave kernels --- .../Dialect/Rock/Transforms/RockPipeline.cpp | 79 ++++- .../Rock/rock-pipeline-early-exit.mlir | 2 +- .../Rock/test_rock_pipeline_wave_barrier.mlir | 271 ++++++++++++++++++ mlir/test/e2e/CMakeLists.txt | 5 + mlir/test/e2e/GemmOneWaveBarrier.cfg | 3 + mlir/test/e2e/GemmOneWaveBarrier.toml | 33 +++ .../e2e/GemmOneWaveBarrierDirectToLDS.cfg | 5 + .../e2e/GemmOneWaveBarrierDirectToLDS.toml | 33 +++ mlir/test/e2e/GemmOneWaveBarrierFp8.cfg | 5 + mlir/test/e2e/GemmOneWaveBarrierFp8.toml | 33 +++ mlir/test/e2e/PrGemmOneWaveBarrier.cfg | 3 + mlir/test/e2e/PrGemmOneWaveBarrier.toml | 22 ++ .../e2e/PrGemmOneWaveBarrierDirectToLDS.cfg | 3 + .../e2e/PrGemmOneWaveBarrierDirectToLDS.toml | 22 ++ 14 files changed, 512 insertions(+), 7 deletions(-) create mode 100644 mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir create mode 100644 mlir/test/e2e/GemmOneWaveBarrier.cfg create mode 100644 mlir/test/e2e/GemmOneWaveBarrier.toml create mode 100644 mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.cfg create mode 100644 mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.toml create mode 100644 mlir/test/e2e/GemmOneWaveBarrierFp8.cfg create mode 100644 mlir/test/e2e/GemmOneWaveBarrierFp8.toml create mode 100644 mlir/test/e2e/PrGemmOneWaveBarrier.cfg create mode 100644 mlir/test/e2e/PrGemmOneWaveBarrier.toml create mode 100644 mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.cfg create mode 100644 mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.toml diff --git a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp index bc3abe13ef26..bfc4f4e6c242 100644 --- a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp +++ b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp @@ -21,6 +21,7 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/MemRef/Transforms/Transforms.h" +#include "mlir/Dialect/Rock/IR/GetRockInfo.h" #include "mlir/Dialect/Rock/IR/Rock.h" #include "mlir/Dialect/Rock/Passes.h" #include "mlir/Dialect/Rock/Transforms/RockMultibuffer.h" @@ -471,6 +472,71 @@ DagType pruneGraph(const DagType &dag) { return prunedGraph; } +// Determine if the backward barrier can be skipped for single-wave kernels. +// +// For scheduleVersion 1 (Default) or 3 (DirectToLDSDefault), the loop +// structure is: +// GlobalLoad -> DSWrite -> (fwd barrier) -> DSRead + MFMA +// +// The forward barrier ensures DSWrites complete before DSReads start. +// For the loop-carried dependency (backward barrier), we need to ensure +// DSReads from iteration i finish before DSWrites from iteration i+1. +// +// When blockSize <= waveSize (single wave), this is guaranteed because +// GPU issues instructions in order within a wave - once DSReads have been +// issued, they have read the data from the buffers, so DSWrites can proceed +// without an explicit barrier. +bool canSkipBackwardBarrierForOneWave(func::FuncOp func, scf::ForOp forOp) { + // Check if this is a single-wave kernel + auto maybeBlockSize = rock::getBlockSize(func); + if (failed(maybeBlockSize)) + return false; + + int64_t blockSize = maybeBlockSize->getInt(); + + // Check if arch attribute exists before calling getArchValue which + // triggers llvm_unreachable if arch is missing + if (!func->hasAttr("arch") && !func->hasAttr("mhal.arch")) + return false; + + StringAttr arch = rock::getArchValue(func); + if (!arch) + return false; + + int64_t waveSize = rock::lookupArchInfo(arch).waveSize; + bool isOneWave = (blockSize <= waveSize); + if (!isOneWave) + return false; + + // for nested loops, it may require more analysis. For now, only support + // single loop. + int forOpCount = 0; + func.walk([&](scf::ForOp) { ++forOpCount; }); + if (forOpCount != 1) + return false; + + // Find the scheduleVersion from ThreadwiseGemmAccelOp within the loop. + // The scheduleVersion is stored in the params attribute of the op. + std::optional scheduleVersion; + forOp.walk([&](rock::ThreadwiseGemmAccelOp gemmOp) { + rock::RockAccelTuningParamAttrInterface params = gemmOp.getParams(); + scheduleVersion = params.getScheduleVersion(); + }); + + if (!scheduleVersion.has_value()) + return false; + + // Check if the schedule version supports skipping the backward barrier. + // Only scheduleVersion 1 (Default) and 3 (DirectToLDSDefault) + // have the loop structure that allows skipping the backward barrier. + bool canSkip = (*scheduleVersion == 1 || *scheduleVersion == 3); + + LLVM_DEBUG(DBGS() << "canSkipBackwardBarrierForOneWave: isOneWave=" + << isOneWave << ", scheduleVersion=" << *scheduleVersion + << ", canSkip=" << canSkip << "\n"); + return canSkip; +} + // Utility function to place an empty stage before or after another `stage`. The // empty stage will contain an `lds_barrier` if `isBarrier` is set to true rock::StageOp placeEmptyStage(IRRewriter &rewriter, Location loc, @@ -493,8 +559,8 @@ rock::StageOp placeEmptyStage(IRRewriter &rewriter, Location loc, // initiation interval twice as big and pipeline as usual. This function // takes also care to update the initiation interval, so that the caller // does not have to know how `placeBarrier` internally works. -void placeBarriers(IRRewriter &rewriter, Location loc, scf::ForOp forOp, - ArrayRef stages, +void placeBarriers(IRRewriter &rewriter, Location loc, func::FuncOp func, + scf::ForOp forOp, ArrayRef stages, SetVector &allocs, SmallVector &extendedStages, int64_t &initiationInterval, int64_t numIterations) { @@ -503,8 +569,9 @@ void placeBarriers(IRRewriter &rewriter, Location loc, scf::ForOp forOp, dag = pruneGraph(dag); // If there is a loop, we probably need a backward barrier, i.e., - // an LDS barrier that takes the loop dependency into account - const bool addBackwardBarrier = numIterations > 1; + // an LDS barrier that takes the loop dependency into account. + bool canSkipBackwardBarrier = canSkipBackwardBarrierForOneWave(func, forOp); + const bool addBackwardBarrier = numIterations > 1 && !canSkipBackwardBarrier; DenseMap timeSlotMap; int timeSlot = 0; @@ -768,8 +835,8 @@ void RockPipeline::runOnOperation() { SmallVector extendedStages; // use "multiAllocs" to place LDS barriers, no need to explicitly place // barriers for registers or globals - placeBarriers(rewriter, loc, forOp, stages, multiAllocs, extendedStages, - ii, numIterations); + placeBarriers(rewriter, loc, func, forOp, stages, multiAllocs, + extendedStages, ii, numIterations); ScheduleType schedule; // use all "resources" to generate dependency graph and generate schedule createSchedule(extendedStages, resources, ii, schedule, diff --git a/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir b/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir index 93d7bfce7506..ad0edbaf0dc7 100644 --- a/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir +++ b/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir @@ -4,7 +4,7 @@ // COUNT-COUNT-1: rock.lds_barrier module { - func.func @pipeline_loop_in_scf_if(%arg0: memref<128xf16>, %arg1: memref<128xf16>, %arg2: memref<128xf16>, %arg3: i32) attributes {block_size = 64 : i32, grid_size = 1 : i32, kernel} { + func.func @pipeline_loop_in_scf_if(%arg0: memref<128xf16>, %arg1: memref<128xf16>, %arg2: memref<128xf16>, %arg3: i32) attributes {arch = "amdgcn-amd-amdhsa:gfx90a", block_size = 64 : i32, grid_size = 1 : i32, kernel} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c4 = arith.constant 4 : index diff --git a/mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir b/mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir new file mode 100644 index 000000000000..4a635f24abe4 --- /dev/null +++ b/mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir @@ -0,0 +1,271 @@ +// RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=false" | FileCheck %s +// RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=true" | FileCheck %s --check-prefix=REMOVE-STAGES + +// This test file verifies the optimization that skips backward LDS barriers +// for single-wave kernels with specific schedule versions. + +// Test for single-wave kernel with scheduleVersion=1 (Default) +// When blockSize <= waveSize and scheduleVersion is 1 or 3, backward barriers should be skipped +// For scheduleVersion=1, the loop has 3 stages: GlobalRead, LDSWrite, LDSRead +// CHECK-LABEL: rock_pipeline_one_wave_schedule_v1 +// REMOVE-STAGES-LABEL: rock_pipeline_one_wave_schedule_v1 +// For single-wave with scheduleVersion=1, we should NOT see a second barrier in the loop +// (backward barrier is skipped) +// Prologue stores to LDS: +// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: scf.for +// Inside loop - only ONE barrier (forward), no backward barrier for single-wave +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: rock.threadwise_gemm_accel +// REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: } +// Epilogue barrier and LDS read: +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return +func.func @rock_pipeline_one_wave_schedule_v1(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 64 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + + // 128 f16 elements = 256 bytes + %rawLds = rock.alloc() : memref<256xi8, #gpu.address_space> + %rawRegA = rock.alloc() : memref<32xi8, #gpu.address_space> + %rawRegB = rock.alloc() : memref<32xi8, #gpu.address_space> + %matrixA = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixB = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixC = memref.alloc() : memref<1x1xvector<4xf32>, #gpu.address_space> + + %lds = memref.view %rawLds[%c0][] : memref<256xi8, #gpu.address_space> to memref<128xf16, #gpu.address_space> + %regA = memref.view %rawRegA[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + %regB = memref.view %rawRegB[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + + scf.for %arg3 = %c0 to %c16 step %c1 { + rock.stage { + %a = memref.load %input[%arg3] : memref<16xf16, #gpu.address_space> + memref.store %a, %regA[%arg3] : memref<16xf16, #gpu.address_space> + rock.yield + }{name="GlobalRead"} + rock.stage { + %a = memref.load %regA[%arg3] : memref<16xf16, #gpu.address_space> + memref.store %a, %lds[%arg3] : memref<128xf16, #gpu.address_space> + rock.yield + }{name="LDSWrite"} + rock.stage { + %a = memref.load %lds[%arg3] : memref<128xf16, #gpu.address_space> + memref.store %a, %regB[%arg3] : memref<16xf16, #gpu.address_space> + %tid = rock.workitem_id : index + rock.threadwise_gemm_accel %matrixC += %matrixA * %matrixB at[%tid, %tid, %tid] { + params = #rock.accel_gemm_params< + kpackPerBlock = 4, mPerBlock = 16, nPerBlock = 16, kpack = 8, + mPerWave = 16, nPerWave = 16, mnPerXdl = 16, splitKFactor = 1, + scheduleVersion = 1, outputSwizzle = 2, wavesPerEU = 0, + gridGroupSize = 0, forceUnroll = true> + } : memref<1x1xvector<4xf32>, #gpu.address_space> += memref<1x2xvector<4xf16>, #gpu.address_space> * memref<1x2xvector<4xf16>, #gpu.address_space> + rock.yield + }{name="LDSRead"} + }{pipeline = #rock.pipeline<2>} + + %out = memref.load %regB[%c0] : memref<16xf16, #gpu.address_space> + memref.store %out, %output[%c0] : memref<16xf16, #gpu.address_space> + return +} + +// Test for single-wave kernel with scheduleVersion=3 (DirectToLDSDefault) +// When blockSize <= waveSize and scheduleVersion is 1 or 3, backward barriers should be skipped +// For scheduleVersion=3, the loop has only 2 stages: GlobalRead (writes directly to LDS) and LDSRead +// CHECK-LABEL: rock_pipeline_one_wave_schedule_v3 +// REMOVE-STAGES-LABEL: rock_pipeline_one_wave_schedule_v3 +// For single-wave with scheduleVersion=3, we should NOT see backward barrier +// The 2-stage loop doesn't fully pipeline but still gets barrier optimization +// REMOVE-STAGES: scf.for +// Inside loop - only ONE barrier for single-wave with scheduleVersion=3 +// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES-NEXT: rock.lds_barrier +// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: rock.threadwise_gemm_accel +// No second barrier before end of loop body +// REMOVE-STAGES: } +// No barriers after loop for this test since it doesn't fully pipeline +// REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES: return +func.func @rock_pipeline_one_wave_schedule_v3(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 64 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + + // 128 f16 elements = 256 bytes + %rawLds = rock.alloc() : memref<256xi8, #gpu.address_space> + %rawRegA = rock.alloc() : memref<32xi8, #gpu.address_space> + %rawRegB = rock.alloc() : memref<32xi8, #gpu.address_space> + %matrixA = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixB = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixC = memref.alloc() : memref<1x1xvector<4xf32>, #gpu.address_space> + + %lds = memref.view %rawLds[%c0][] : memref<256xi8, #gpu.address_space> to memref<128xf16, #gpu.address_space> + %regA = memref.view %rawRegA[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + %regB = memref.view %rawRegB[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + + // For scheduleVersion=3 (DirectToLDS), there are only 2 stages: + // Stage 1: GlobalRead - loads from global and writes directly to LDS + // Stage 2: LDSRead - reads from LDS and performs MFMA + scf.for %arg3 = %c0 to %c16 step %c1 { + rock.stage { + // GlobalRead stage: load from global and write DIRECTLY to LDS (Direct-to-LDS) + %a = memref.load %input[%arg3] : memref<16xf16, #gpu.address_space> + memref.store %a, %lds[%arg3] : memref<128xf16, #gpu.address_space> + rock.yield + }{name="GlobalRead"} + rock.stage { + // LDSRead stage: read from LDS and perform MFMA + %a = memref.load %lds[%arg3] : memref<128xf16, #gpu.address_space> + memref.store %a, %regB[%arg3] : memref<16xf16, #gpu.address_space> + %tid = rock.workitem_id : index + rock.threadwise_gemm_accel %matrixC += %matrixA * %matrixB at[%tid, %tid, %tid] { + params = #rock.accel_gemm_params< + kpackPerBlock = 4, mPerBlock = 16, nPerBlock = 16, kpack = 8, + mPerWave = 16, nPerWave = 16, mnPerXdl = 16, splitKFactor = 1, + scheduleVersion = 3, outputSwizzle = 2, wavesPerEU = 0, + gridGroupSize = 0, forceUnroll = true> + } : memref<1x1xvector<4xf32>, #gpu.address_space> += memref<1x2xvector<4xf16>, #gpu.address_space> * memref<1x2xvector<4xf16>, #gpu.address_space> + rock.yield + }{name="LDSRead"} + }{pipeline = #rock.pipeline<2>} + + %out = memref.load %regB[%c0] : memref<16xf16, #gpu.address_space> + memref.store %out, %output[%c0] : memref<16xf16, #gpu.address_space> + return +} + +// Test for multi-wave kernel with scheduleVersion=1 - should still have backward barrier +// When blockSize > waveSize, backward barriers should NOT be skipped +// CHECK-LABEL: rock_pipeline_multi_wave_schedule_v1 +// REMOVE-STAGES-LABEL: rock_pipeline_multi_wave_schedule_v1 +// For multi-wave, we SHOULD see TWO barriers in the loop (forward and backward) +// Prologue stores to LDS: +// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: scf.for +// Inside loop - TWO barriers for multi-wave (forward + backward) +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: rock.threadwise_gemm_accel +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: } +// Epilogue barrier and LDS read: +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return +func.func @rock_pipeline_multi_wave_schedule_v1(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 128 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + + // 128 f16 elements = 256 bytes + %rawLds = rock.alloc() : memref<256xi8, #gpu.address_space> + %rawRegA = rock.alloc() : memref<32xi8, #gpu.address_space> + %rawRegB = rock.alloc() : memref<32xi8, #gpu.address_space> + %matrixA = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixB = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixC = memref.alloc() : memref<1x1xvector<4xf32>, #gpu.address_space> + + %lds = memref.view %rawLds[%c0][] : memref<256xi8, #gpu.address_space> to memref<128xf16, #gpu.address_space> + %regA = memref.view %rawRegA[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + %regB = memref.view %rawRegB[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + + scf.for %arg3 = %c0 to %c16 step %c1 { + rock.stage { + %a = memref.load %input[%arg3] : memref<16xf16, #gpu.address_space> + memref.store %a, %regA[%arg3] : memref<16xf16, #gpu.address_space> + rock.yield + }{name="GlobalRead"} + rock.stage { + %a = memref.load %regA[%arg3] : memref<16xf16, #gpu.address_space> + memref.store %a, %lds[%arg3] : memref<128xf16, #gpu.address_space> + rock.yield + }{name="LDSWrite"} + rock.stage { + %a = memref.load %lds[%arg3] : memref<128xf16, #gpu.address_space> + memref.store %a, %regB[%arg3] : memref<16xf16, #gpu.address_space> + %tid = rock.workitem_id : index + rock.threadwise_gemm_accel %matrixC += %matrixA * %matrixB at[%tid, %tid, %tid] { + params = #rock.accel_gemm_params< + kpackPerBlock = 4, mPerBlock = 16, nPerBlock = 16, kpack = 8, + mPerWave = 16, nPerWave = 16, mnPerXdl = 16, splitKFactor = 1, + scheduleVersion = 1, outputSwizzle = 2, wavesPerEU = 0, + gridGroupSize = 0, forceUnroll = true> + } : memref<1x1xvector<4xf32>, #gpu.address_space> += memref<1x2xvector<4xf16>, #gpu.address_space> * memref<1x2xvector<4xf16>, #gpu.address_space> + rock.yield + }{name="LDSRead"} + }{pipeline = #rock.pipeline<2>} + + %out = memref.load %regB[%c0] : memref<16xf16, #gpu.address_space> + memref.store %out, %output[%c0] : memref<16xf16, #gpu.address_space> + return +} + +// Test for single-wave kernel with scheduleVersion=2 (DoubleBuffer) - should still have backward barrier +// scheduleVersion=2 does NOT allow skipping backward barrier even for single-wave +// CHECK-LABEL: rock_pipeline_one_wave_schedule_v2 +// REMOVE-STAGES-LABEL: rock_pipeline_one_wave_schedule_v2 +// For scheduleVersion=2, we SHOULD see TWO barriers even with single-wave +// Prologue stores to LDS: +// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: scf.for +// Inside loop - TWO barriers even for single-wave with scheduleVersion=2 +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: rock.threadwise_gemm_accel +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// REMOVE-STAGES: } +// Epilogue barrier and LDS read: +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return +func.func @rock_pipeline_one_wave_schedule_v2(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 64 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + + // 128 f16 elements = 256 bytes + %rawLds = rock.alloc() : memref<256xi8, #gpu.address_space> + %rawRegA = rock.alloc() : memref<32xi8, #gpu.address_space> + %rawRegB = rock.alloc() : memref<32xi8, #gpu.address_space> + %matrixA = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixB = memref.alloc() : memref<1x2xvector<4xf16>, #gpu.address_space> + %matrixC = memref.alloc() : memref<1x1xvector<4xf32>, #gpu.address_space> + + %lds = memref.view %rawLds[%c0][] : memref<256xi8, #gpu.address_space> to memref<128xf16, #gpu.address_space> + %regA = memref.view %rawRegA[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + %regB = memref.view %rawRegB[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> + + scf.for %arg3 = %c0 to %c16 step %c1 { + rock.stage { + %a = memref.load %input[%arg3] : memref<16xf16, #gpu.address_space> + memref.store %a, %regA[%arg3] : memref<16xf16, #gpu.address_space> + rock.yield + }{name="GlobalRead"} + rock.stage { + %a = memref.load %regA[%arg3] : memref<16xf16, #gpu.address_space> + memref.store %a, %lds[%arg3] : memref<128xf16, #gpu.address_space> + rock.yield + }{name="LDSWrite"} + rock.stage { + %a = memref.load %lds[%arg3] : memref<128xf16, #gpu.address_space> + memref.store %a, %regB[%arg3] : memref<16xf16, #gpu.address_space> + %tid = rock.workitem_id : index + rock.threadwise_gemm_accel %matrixC += %matrixA * %matrixB at[%tid, %tid, %tid] { + params = #rock.accel_gemm_params< + kpackPerBlock = 4, mPerBlock = 16, nPerBlock = 16, kpack = 8, + mPerWave = 16, nPerWave = 16, mnPerXdl = 16, splitKFactor = 1, + scheduleVersion = 2, outputSwizzle = 2, wavesPerEU = 0, + gridGroupSize = 0, forceUnroll = true> + } : memref<1x1xvector<4xf32>, #gpu.address_space> += memref<1x2xvector<4xf16>, #gpu.address_space> * memref<1x2xvector<4xf16>, #gpu.address_space> + rock.yield + }{name="LDSRead"} + }{pipeline = #rock.pipeline<2>} + + %out = memref.load %regB[%c0] : memref<16xf16, #gpu.address_space> + memref.store %out, %output[%c0] : memref<16xf16, #gpu.address_space> + return +} diff --git a/mlir/test/e2e/CMakeLists.txt b/mlir/test/e2e/CMakeLists.txt index ca0d7c2473f3..269163ca26ad 100644 --- a/mlir/test/e2e/CMakeLists.txt +++ b/mlir/test/e2e/CMakeLists.txt @@ -53,6 +53,8 @@ if (ROCMLIR_DRIVER_PR_E2E_TEST_ENABLED) PrLdsTransposeLoadAttention PrConvDirectToLDS PrAttentionDirectToLDS + PrGemmOneWaveBarrier + PrGemmOneWaveBarrierDirectToLDS ) set(GEN_MODE "") endif() @@ -102,6 +104,9 @@ if (ROCK_E2E_TEST_ENABLED) AttentionNonPowerOfTwoTileSize LdsTransposeLoad LdsTransposeLoadAttention + GemmOneWaveBarrier + GemmOneWaveBarrierDirectToLDS + GemmOneWaveBarrierFp8 ) endif() # Create a list for dummy files diff --git a/mlir/test/e2e/GemmOneWaveBarrier.cfg b/mlir/test/e2e/GemmOneWaveBarrier.cfg new file mode 100644 index 000000000000..f03c401eaf22 --- /dev/null +++ b/mlir/test/e2e/GemmOneWaveBarrier.cfg @@ -0,0 +1,3 @@ +# Require MFMA or WMMA support +if not config.arch_support_mfma and not config.arch_support_wmma: + config.unsupported = True diff --git a/mlir/test/e2e/GemmOneWaveBarrier.toml b/mlir/test/e2e/GemmOneWaveBarrier.toml new file mode 100644 index 000000000000..3937e6b64a5e --- /dev/null +++ b/mlir/test/e2e/GemmOneWaveBarrier.toml @@ -0,0 +1,33 @@ +# One-wave GEMM barrier optimization test (scheduleVersion=1) + +directory = "GemmOneWaveBarrier" +prefix = "rocmlir-gen" +suffix = "--operation gemm --arch %arch %pv %random_data %rocmlir_gen_flags | rocmlir-driver -c | mlir-runner -O2 --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext --entry-point-result=void | FileCheck %s --check-prefix=" + +[[axis]] +name = "transA" +values = ["true", "false"] +prefix = "--transA=" + +[[axis]] +name = "transB" +values = ["true", "false"] +prefix = "--transB=" + +[[axis]] +name = "data type" +values = ["f32", "f16", "bf16", "i8"] +prefix = "-t " + +[[axis]] +name = "perf_config" +# One-wave: mPerBlock=mPerWave=32, nPerBlock=nPerWave=32, kpackPerBlock=16, scheduleVersion=1 +values = ["v4:32,32,16,32,32,32,1,1,1,2,0,0,1,1"] +prefix = "-perf_config=" + +[[suite]] +name = "gemm_one_wave_barrier" + +# gridSize = (m/32) * (n/32) = (1024/32) * (1024/32) = 32 * 32 = 1024 > 4*numCU +[[suite.test]] +config = "-g 1 -m 1024 -k 64 -n 1024" diff --git a/mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.cfg b/mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.cfg new file mode 100644 index 000000000000..65cc870f6efe --- /dev/null +++ b/mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.cfg @@ -0,0 +1,5 @@ +# Require direct_to_lds feature for scheduleVersion 3, and fp8 support +if not 'direct_to_lds_32b' in config.features and not 'direct_to_lds_128b' in config.features: + config.unsupported = True +if not config.arch_support_accel_fp8: + config.unsupported = True diff --git a/mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.toml b/mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.toml new file mode 100644 index 000000000000..ec2b14a86a87 --- /dev/null +++ b/mlir/test/e2e/GemmOneWaveBarrierDirectToLDS.toml @@ -0,0 +1,33 @@ +# One-wave GEMM barrier optimization test (scheduleVersion=3, DirectToLDS) + +directory = "GemmOneWaveBarrierDirectToLDS" +prefix = "rocmlir-gen" +suffix = "--operation gemm --arch %arch %pv %random_data %rocmlir_gen_flags | rocmlir-driver -c | mlir-runner -O2 --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext --entry-point-result=void | FileCheck %s --check-prefix=" + +[[axis]] +name = "transA" +values = ["true", "false"] +prefix = "--transA=" + +[[axis]] +name = "transB" +values = ["true", "false"] +prefix = "--transB=" + +[[axis]] +name = "data type" +values = ["f32", "f16", "bf16", "i8", "fp8_fp8"] +prefix = "-t " + +[[axis]] +name = "perf_config" +# One-wave: mPerBlock=mPerWave=32, nPerBlock=nPerWave=32, kpackPerBlock=16, scheduleVersion=3 +values = ["v4:32,32,16,32,32,32,1,1,3,2,0,0,1,1"] +prefix = "-perf_config=" + +[[suite]] +name = "gemm_one_wave_barrier_direct_to_lds" + +# gridSize = (m/32) * (n/32) = (1024/32) * (1024/32) = 32 * 32 = 1024 > 4*numCU +[[suite.test]] +config = "-g 1 -m 1024 -k 64 -n 1024" diff --git a/mlir/test/e2e/GemmOneWaveBarrierFp8.cfg b/mlir/test/e2e/GemmOneWaveBarrierFp8.cfg new file mode 100644 index 000000000000..617d8f44098f --- /dev/null +++ b/mlir/test/e2e/GemmOneWaveBarrierFp8.cfg @@ -0,0 +1,5 @@ +# Require MFMA or WMMA support and fp8 acceleration +if not config.arch_support_mfma and not config.arch_support_wmma: + config.unsupported = True +if not config.arch_support_accel_fp8: + config.unsupported = True diff --git a/mlir/test/e2e/GemmOneWaveBarrierFp8.toml b/mlir/test/e2e/GemmOneWaveBarrierFp8.toml new file mode 100644 index 000000000000..7bda6b27e790 --- /dev/null +++ b/mlir/test/e2e/GemmOneWaveBarrierFp8.toml @@ -0,0 +1,33 @@ +# One-wave GEMM barrier optimization test for fp8 (scheduleVersion=1) + +directory = "GemmOneWaveBarrierFp8" +prefix = "rocmlir-gen" +suffix = "--operation gemm --arch %arch %pv %random_data %rocmlir_gen_flags | rocmlir-driver -c | mlir-runner -O2 --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext --entry-point-result=void | FileCheck %s --check-prefix=" + +[[axis]] +name = "transA" +values = ["true", "false"] +prefix = "--transA=" + +[[axis]] +name = "transB" +values = ["true", "false"] +prefix = "--transB=" + +[[axis]] +name = "data type" +values = ["fp8_fp8"] +prefix = "-t " + +[[axis]] +name = "perf_config" +# One-wave: mPerBlock=mPerWave=32, nPerBlock=nPerWave=32, kpackPerBlock=16, scheduleVersion=1 +values = ["v4:32,32,16,32,32,32,1,1,1,2,0,0,1,1"] +prefix = "-perf_config=" + +[[suite]] +name = "gemm_one_wave_barrier_fp8" + +# gridSize = (m/32) * (n/32) = (1024/32) * (1024/32) = 32 * 32 = 1024 > 4*numCU +[[suite.test]] +config = "-g 1 -m 1024 -k 64 -n 1024" diff --git a/mlir/test/e2e/PrGemmOneWaveBarrier.cfg b/mlir/test/e2e/PrGemmOneWaveBarrier.cfg new file mode 100644 index 000000000000..4428522278dc --- /dev/null +++ b/mlir/test/e2e/PrGemmOneWaveBarrier.cfg @@ -0,0 +1,3 @@ +# Require MFMA or WMMA support for accelerated GEMM +if not (config.arch_support_mfma or config.arch_support_wmma): + config.unsupported = True diff --git a/mlir/test/e2e/PrGemmOneWaveBarrier.toml b/mlir/test/e2e/PrGemmOneWaveBarrier.toml new file mode 100644 index 000000000000..73ec9be13121 --- /dev/null +++ b/mlir/test/e2e/PrGemmOneWaveBarrier.toml @@ -0,0 +1,22 @@ +# One-wave GEMM barrier optimization test (scheduleVersion=1) + +directory = "PrGemmOneWaveBarrier" +prefix = "rocmlir-gen" +suffix = "--operation gemm --arch %arch %pv %random_data %rocmlir_gen_flags | rocmlir-driver -c | mlir-runner -O2 --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext --entry-point-result=void | FileCheck %s --check-prefix=" + +[[axis]] +name = "data type" +values = ["f16"] +prefix = "-t " + +[[axis]] +name = "perf_config" +# One-wave: mPerBlock=mPerWave=32, nPerBlock=nPerWave=32, scheduleVersion=1 +values = ["v4:32,32,8,32,32,32,4,1,1,2,0,0,1,1"] +prefix = "-perf_config=" + +[[suite]] +name = "pr_gemm_one_wave_barrier" + +[[suite.test]] +config = "-g 1 -m 32 -k 64 -n 32" diff --git a/mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.cfg b/mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.cfg new file mode 100644 index 000000000000..5f04a2150299 --- /dev/null +++ b/mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.cfg @@ -0,0 +1,3 @@ +# Require direct_to_lds feature for scheduleVersion 3 +if not 'direct_to_lds_32b' in config.features and not 'direct_to_lds_128b' in config.features: + config.unsupported = True diff --git a/mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.toml b/mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.toml new file mode 100644 index 000000000000..b05be858a0e5 --- /dev/null +++ b/mlir/test/e2e/PrGemmOneWaveBarrierDirectToLDS.toml @@ -0,0 +1,22 @@ +# One-wave GEMM barrier optimization test (scheduleVersion=3, DirectToLDS) + +directory = "PrGemmOneWaveBarrierDirectToLDS" +prefix = "rocmlir-gen" +suffix = "--operation gemm --arch %arch %pv %random_data %rocmlir_gen_flags | rocmlir-driver -c | mlir-runner -O2 --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext,%conv_validation_wrapper_library_dir/libconv-validation-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext,%linalg_test_lib_dir/libmlir_float16_utils%shlibext --entry-point-result=void | FileCheck %s --check-prefix=" + +[[axis]] +name = "data type" +values = ["f16"] +prefix = "-t " + +[[axis]] +name = "perf_config" +# One-wave: mPerBlock=mPerWave=32, nPerBlock=nPerWave=32, scheduleVersion=3 +values = ["v4:32,32,8,32,32,32,4,1,3,2,0,0,1,1"] +prefix = "-perf_config=" + +[[suite]] +name = "pr_gemm_one_wave_barrier_direct_to_lds" + +[[suite.test]] +config = "-g 1 -m 32 -k 64 -n 32" From e08393294bc78849ec20ed3825be418f97dfca71 Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Tue, 24 Feb 2026 20:35:35 +0000 Subject: [PATCH 2/3] just use CHECK --- .../Rock/test_rock_pipeline_wave_barrier.mlir | 109 ++++++++---------- 1 file changed, 51 insertions(+), 58 deletions(-) diff --git a/mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir b/mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir index 4a635f24abe4..a8bca60ed7bf 100644 --- a/mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir +++ b/mlir/test/Dialect/Rock/test_rock_pipeline_wave_barrier.mlir @@ -1,5 +1,4 @@ -// RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=false" | FileCheck %s -// RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=true" | FileCheck %s --check-prefix=REMOVE-STAGES +// RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=true" | FileCheck %s // This test file verifies the optimization that skips backward LDS barriers // for single-wave kernels with specific schedule versions. @@ -7,23 +6,21 @@ // Test for single-wave kernel with scheduleVersion=1 (Default) // When blockSize <= waveSize and scheduleVersion is 1 or 3, backward barriers should be skipped // For scheduleVersion=1, the loop has 3 stages: GlobalRead, LDSWrite, LDSRead -// CHECK-LABEL: rock_pipeline_one_wave_schedule_v1 -// REMOVE-STAGES-LABEL: rock_pipeline_one_wave_schedule_v1 -// For single-wave with scheduleVersion=1, we should NOT see a second barrier in the loop -// (backward barrier is skipped) + +// CHECK-LABEL: func.func @rock_pipeline_one_wave_schedule_v1 // Prologue stores to LDS: -// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: scf.for -// Inside loop - only ONE barrier (forward), no backward barrier for single-wave -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: rock.threadwise_gemm_accel -// REMOVE-STAGES-NOT: rock.lds_barrier -// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: } +// CHECK: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: scf.for +// Inside loop - only ONE barrier (forward), no backward barrier for single-wave +// CHECK: rock.lds_barrier +// CHECK: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: rock.threadwise_gemm_accel +// CHECK-NOT: rock.lds_barrier +// CHECK: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: } // Epilogue barrier and LDS read: -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: return +// CHECK: rock.lds_barrier +// CHECK: return func.func @rock_pipeline_one_wave_schedule_v1(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 64 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -75,21 +72,19 @@ func.func @rock_pipeline_one_wave_schedule_v1(%input : memref<16xf16, #gpu.addre // Test for single-wave kernel with scheduleVersion=3 (DirectToLDSDefault) // When blockSize <= waveSize and scheduleVersion is 1 or 3, backward barriers should be skipped // For scheduleVersion=3, the loop has only 2 stages: GlobalRead (writes directly to LDS) and LDSRead -// CHECK-LABEL: rock_pipeline_one_wave_schedule_v3 -// REMOVE-STAGES-LABEL: rock_pipeline_one_wave_schedule_v3 -// For single-wave with scheduleVersion=3, we should NOT see backward barrier -// The 2-stage loop doesn't fully pipeline but still gets barrier optimization -// REMOVE-STAGES: scf.for -// Inside loop - only ONE barrier for single-wave with scheduleVersion=3 -// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES-NEXT: rock.lds_barrier -// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: rock.threadwise_gemm_accel + +// CHECK-LABEL: func.func @rock_pipeline_one_wave_schedule_v3 +// CHECK: scf.for +// Inside loop - only ONE barrier for single-wave with scheduleVersion=3 +// CHECK: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK-NEXT: rock.lds_barrier +// CHECK: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: rock.threadwise_gemm_accel // No second barrier before end of loop body -// REMOVE-STAGES: } +// CHECK: } // No barriers after loop for this test since it doesn't fully pipeline -// REMOVE-STAGES-NOT: rock.lds_barrier -// REMOVE-STAGES: return +// CHECK-NOT: rock.lds_barrier +// CHECK: return func.func @rock_pipeline_one_wave_schedule_v3(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 64 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -140,22 +135,21 @@ func.func @rock_pipeline_one_wave_schedule_v3(%input : memref<16xf16, #gpu.addre // Test for multi-wave kernel with scheduleVersion=1 - should still have backward barrier // When blockSize > waveSize, backward barriers should NOT be skipped -// CHECK-LABEL: rock_pipeline_multi_wave_schedule_v1 -// REMOVE-STAGES-LABEL: rock_pipeline_multi_wave_schedule_v1 -// For multi-wave, we SHOULD see TWO barriers in the loop (forward and backward) + +// CHECK-LABEL: func.func @rock_pipeline_multi_wave_schedule_v1 // Prologue stores to LDS: -// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: scf.for -// Inside loop - TWO barriers for multi-wave (forward + backward) -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: rock.threadwise_gemm_accel -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: } +// CHECK: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: scf.for +// Inside loop - TWO barriers for multi-wave (forward + backward) +// CHECK: rock.lds_barrier +// CHECK: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: rock.threadwise_gemm_accel +// CHECK: rock.lds_barrier +// CHECK: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: } // Epilogue barrier and LDS read: -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: return +// CHECK: rock.lds_barrier +// CHECK: return func.func @rock_pipeline_multi_wave_schedule_v1(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 128 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -206,22 +200,21 @@ func.func @rock_pipeline_multi_wave_schedule_v1(%input : memref<16xf16, #gpu.add // Test for single-wave kernel with scheduleVersion=2 (DoubleBuffer) - should still have backward barrier // scheduleVersion=2 does NOT allow skipping backward barrier even for single-wave -// CHECK-LABEL: rock_pipeline_one_wave_schedule_v2 -// REMOVE-STAGES-LABEL: rock_pipeline_one_wave_schedule_v2 -// For scheduleVersion=2, we SHOULD see TWO barriers even with single-wave + +// CHECK-LABEL: func.func @rock_pipeline_one_wave_schedule_v2 // Prologue stores to LDS: -// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: scf.for -// Inside loop - TWO barriers even for single-wave with scheduleVersion=2 -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: memref.load {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: rock.threadwise_gemm_accel -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: memref.store {{.*}} : memref<128xf16, #gpu.address_space> -// REMOVE-STAGES: } +// CHECK: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: scf.for +// Inside loop - TWO barriers even for single-wave with scheduleVersion=2 +// CHECK: rock.lds_barrier +// CHECK: memref.load {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: rock.threadwise_gemm_accel +// CHECK: rock.lds_barrier +// CHECK: memref.store {{.*}} : memref<128xf16, #gpu.address_space> +// CHECK: } // Epilogue barrier and LDS read: -// REMOVE-STAGES: rock.lds_barrier -// REMOVE-STAGES: return +// CHECK: rock.lds_barrier +// CHECK: return func.func @rock_pipeline_one_wave_schedule_v2(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) attributes {block_size = 64 : i32, arch = "amdgcn-amd-amdhsa:gfx90a"} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index From 25dd1ad312df6e176e4f6808e66394aec5421fc3 Mon Sep 17 00:00:00 2001 From: Umang Yadav <29876643+umangyadav@users.noreply.github.com> Date: Tue, 24 Feb 2026 15:38:56 -0500 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../Dialect/Rock/Transforms/RockPipeline.cpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp index bfc4f4e6c242..271052f67770 100644 --- a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp +++ b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp @@ -474,17 +474,21 @@ DagType pruneGraph(const DagType &dag) { // Determine if the backward barrier can be skipped for single-wave kernels. // -// For scheduleVersion 1 (Default) or 3 (DirectToLDSDefault), the loop -// structure is: +// For scheduleVersion 1 (Default), the loop structure is: // GlobalLoad -> DSWrite -> (fwd barrier) -> DSRead + MFMA // -// The forward barrier ensures DSWrites complete before DSReads start. -// For the loop-carried dependency (backward barrier), we need to ensure -// DSReads from iteration i finish before DSWrites from iteration i+1. +// For scheduleVersion 3 (DirectToLDSDefault), GlobalLoad writes directly to +// LDS, so the loop structure is logically: +// GlobalLoad (to LDS) -> (fwd barrier) -> DSRead + MFMA +// +// In both cases, the forward barrier ensures LDS writes (explicit DSWrite or +// DirectToLDS GlobalLoad) complete before DSReads start. For the +// loop-carried dependency (backward barrier), we need to ensure DSReads from +// iteration i finish before LDS writes from iteration i+1. // // When blockSize <= waveSize (single wave), this is guaranteed because // GPU issues instructions in order within a wave - once DSReads have been -// issued, they have read the data from the buffers, so DSWrites can proceed +// issued, they have read the data from the buffers, so LDS writes can proceed // without an explicit barrier. bool canSkipBackwardBarrierForOneWave(func::FuncOp func, scf::ForOp forOp) { // Check if this is a single-wave kernel @@ -500,15 +504,14 @@ bool canSkipBackwardBarrierForOneWave(func::FuncOp func, scf::ForOp forOp) { return false; StringAttr arch = rock::getArchValue(func); - if (!arch) - return false; + int64_t waveSize = rock::lookupArchInfo(arch).waveSize; bool isOneWave = (blockSize <= waveSize); if (!isOneWave) return false; - // for nested loops, it may require more analysis. For now, only support + // For nested loops, it may require more analysis. For now, only support // single loop. int forOpCount = 0; func.walk([&](scf::ForOp) { ++forOpCount; });