From fd5699d30800e55558c6a67d5ba0490f2d515e9b Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Tue, 24 Feb 2026 15:20:49 +0000 Subject: [PATCH 1/3] Fix barrier placement for scheduleVersion = 1. It should appear before LDSRead and not before GlobalLoad. Also fix a bug where, if there are no stages found then it should preserve barriers in original loop --- .../Dialect/Rock/Transforms/RockPipeline.cpp | 48 ++--- .../Rock/rock-pipeline-early-exit.mlir | 12 +- .../test/Dialect/Rock/test_rock_pipeline.mlir | 187 ++++++++++++++++ .../Rock/test_rock_pipeline_nested.mlir | 202 ++++++++++++++++-- 4 files changed, 406 insertions(+), 43 deletions(-) diff --git a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp index 17e5e2534056..84a40b231c6a 100644 --- a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp +++ b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp @@ -139,8 +139,9 @@ struct RemoveBackToBackBarriersRewritePattern LogicalResult matchAndRewrite(rock::LDSBarrierOp op, PatternRewriter &rw) const override { - if (dyn_cast_or_null(op->getNextNode())) { - op->getNextNode()->erase(); + if (auto nextBarrier = + dyn_cast_or_null(op->getNextNode())) { + rw.eraseOp(nextBarrier); return success(); } return failure(); @@ -162,7 +163,12 @@ struct PushBarrierDownRewritePattern return failure(); // Don't go over the terminator - if (!nextOp->getNextNode()) + if (nextOp->hasTrait() || + nextOp->hasTrait()) + return failure(); + + // Don't push past another barrier - let RemoveBackToBackBarriers handle it + if (isa(nextOp)) return failure(); // We assume that operations that have a body may modify LDS @@ -733,14 +739,14 @@ void RockPipeline::runOnOperation() { forOp.walk([&](rock::StageOp stageOp) { stages.push_back(stageOp); }); + if (stages.empty()) + continue; + forOp.walk([](rock::LDSBarrierOp barrier) { if (!barrier->getParentOfType()) barrier->erase(); }); - if (stages.empty()) - continue; - LLVM_DEBUG(DBGS() << "Number of stages: " << stages.size() << "\n"); LLVM_DEBUG(DBGS() << "Initiation Interval: " << ii << "\n"); size_t numStages = stages.size(); @@ -764,6 +770,7 @@ void RockPipeline::runOnOperation() { // barriers for registers or globals placeBarriers(rewriter, loc, forOp, stages, multiAllocs, extendedStages, ii, numIterations); + LLVM_DEBUG(DBGS() << "ForOp: " << forOp << "\n"); ScheduleType schedule; // use all "resources" to generate dependency graph and generate schedule @@ -796,30 +803,23 @@ void RockPipeline::runOnOperation() { } } } + LLVM_DEBUG({ + DBGS() << "After remulti-buffer\n"; + DBGS() << "===============\n"; + DBGS() << "Operation: " << getOperation() << "\n"; + DBGS() << "===============\n"; + }); // Cleanup the stages { if (removeStages) { - RewritePatternSet patternsPushBarrier(&getContext()); - // run PushBarrierDownRewritePattern before RemoveStagesRewritePattern, - // because the latter will remove the stages and their terminators - patternsPushBarrier.add(ctx); - if (failed(applyPatternsGreedily(func, std::move(patternsPushBarrier)))) - return signalPassFailure(); - - // run RemoveStagesRewritePattern before - // RemoveBackToBackBarriersRewritePattern, because the latter expects to - // find no stages - RewritePatternSet patternsRemoveStages(&getContext()); - patternsRemoveStages.add(ctx); + RewritePatternSet patterns(&getContext()); + patterns.add(&getContext()); if (failed( - applyPatternsGreedily(func, std::move(patternsRemoveStages)))) - return signalPassFailure(); - - RewritePatternSet patternsBackToBack(&getContext()); - patternsBackToBack.add(ctx); - if (failed(applyPatternsGreedily(func, std::move(patternsBackToBack)))) + applyPatternsGreedily(getOperation(), std::move(patterns)))) { return signalPassFailure(); + } } } } diff --git a/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir b/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir index 592557e3e24e..93d7bfce7506 100644 --- a/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir +++ b/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir @@ -66,6 +66,10 @@ module { // CHECK: arith.addf // CHECK: memref.store {{.*}}[%[[INNER_IV]]] // CHECK: } + // CHECK: %[[ALLOC_LDS_A:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> + // CHECK: %[[ALLOC_LDS_B:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> + // CHECK: %[[WID_LDS:.*]] = rock.workitem_id : index + // CHECK: memref.load %[[ALLOC_LDS_A]][%c0] // CHECK-NEXT: rock.lds_barrier affine.for %arg5 = 0 to 16 { %4 = memref.load %1[%arg5] : memref<64xf16, #gpu.address_space> @@ -82,13 +86,7 @@ module { rock.lds_barrier } {pipeline = #rock.pipeline<2>} - // CHECK: %[[ALLOC_G:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> - // CHECK: %[[ALLOC_H:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> - // CHECK: %[[WID4:.*]] = rock.workitem_id : index - // CHECK: memref.load %[[ALLOC_G]][%c0] - // CHECK: memref.store {{.*}}, {{.*}}[%[[WID4]]] - // CHECK: memref.load %[[ALLOC_H]][%c0] - // CHECK: memref.store {{.*}}, {{.*}}[%[[WID4]]] + // CHECK: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<64xf16, #gpu.address_space> // CHECK: } // CHECK-NOT: {pipeline = #rock.pipeline<2>} diff --git a/mlir/test/Dialect/Rock/test_rock_pipeline.mlir b/mlir/test/Dialect/Rock/test_rock_pipeline.mlir index 582bdc24dcf2..9a5de2df69a5 100644 --- a/mlir/test/Dialect/Rock/test_rock_pipeline.mlir +++ b/mlir/test/Dialect/Rock/test_rock_pipeline.mlir @@ -2,6 +2,17 @@ // RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=true" | FileCheck %s --check-prefix=REMOVE-STAGES // CHECK-LABEL: rock_pipeline_3_stages_ii_1 +// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_1 +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_BUF:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_BUF]]{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_3_stages_ii_1(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -60,6 +71,22 @@ func.func @rock_pipeline_3_stages_ii_1(%input : memref<16xi8, #gpu.address_space } // CHECK-LABEL: rock_pipeline_3_stages_ii_2 +// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_2 +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS read + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]] + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_3_stages_ii_2(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -119,6 +146,19 @@ func.func @rock_pipeline_3_stages_ii_2(%input : memref<16xi8, #gpu.address_space // this test shouldn't pipeline loop but it would add barriers and multibuffer by 1 // CHECK-LABEL: rock_pipeline_3_stages_ii_2_less_iterations +// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_2_less_iterations +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS read + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]] +// REMOVE-STAGES: } +// Epilogue: barrier before LDS read +// REMOVE-STAGES-NEXT: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_3_stages_ii_2_less_iterations(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -170,6 +210,23 @@ func.func @rock_pipeline_3_stages_ii_2_less_iterations(%input : memref<16xi8, #g } // CHECK-LABEL: rock_pipeline_3_stages_ii_3 +// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_3 +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space> + // Barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]] +// REMOVE-STAGES: } +// No barrier after loop for this function - all barriers are inside the loop +// REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_3_stages_ii_3(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -223,6 +280,14 @@ func.func @rock_pipeline_3_stages_ii_3(%input : memref<16xi8, #gpu.address_space // This test shouldn't do any pipelining as it doesn't have any stages but it should still multibuffer by 1 // CHECK-LABEL: rock_pipeline_no_stages_ii_1 +// REMOVE-STAGES-LABEL: rock_pipeline_no_stages_ii_1 +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_no_stages_ii_1(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -266,6 +331,28 @@ func.func @rock_pipeline_no_stages_ii_1(%input : memref<16xi8, #gpu.address_spac } // CHECK-LABEL: rock_pipeline_4_stages_ii_2 +// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_2 +// REMOVE-STAGES-NOT: rock.stage +// Prologue: barrier before LDS read +// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES-NEXT: rock.lds_barrier +// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space +// REMOVE-STAGES-NEXT: memref.load %[[LDS_PRO]] +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space> + // Barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]] +// REMOVE-STAGES: } +// REMOVE-STAGES-NEXT: rock.lds_barrier +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_4_stages_ii_2(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -333,6 +420,27 @@ func.func @rock_pipeline_4_stages_ii_2(%input : memref<16xi8, #gpu.address_space } // CHECK-LABEL: rock_pipeline_4_stages_ii_1_i8 +// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_i8 +// REMOVE-STAGES-NOT: rock.stage +// Prologue: barrier before LDS write +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space +// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_PRO]]{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: } +// Epilogue: barrier before LDS write, barrier before LDS read +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_4_stages_ii_1_i8(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -413,6 +521,27 @@ func.func @rock_pipeline_4_stages_ii_1_i8(%input : memref<16xi8, #gpu.address_sp } // CHECK-LABEL: rock_pipeline_4_stages_ii_1_f16 +// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_f16 +// REMOVE-STAGES-NOT: rock.stage +// Prologue: barrier before LDS write +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space +// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_PRO]]{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: } +// Epilogue: barrier before LDS write, barrier before LDS read +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_4_stages_ii_1_f16(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -489,6 +618,23 @@ func.func @rock_pipeline_4_stages_ii_1_f16(%input : memref<16xf16, #gpu.address_ // This test should adjust II to 2 to enable loop pipelining // CHECK-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations +// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS read + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]] + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: } +// REMOVE-STAGES-NEXT: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_4_stages_ii_1_f16_less_iterations(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -557,6 +703,27 @@ func.func @rock_pipeline_4_stages_ii_1_f16_less_iterations(%input : memref<16xf1 // this test should do loop pipelining without adjust II but notice that it emits scf.for loop with zero iterations. // CHECK-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations_2 +// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations_2 +// REMOVE-STAGES-NOT: rock.stage +// Prologue: barrier before LDS write +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space +// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_PRO]]{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: } +// Epilogue: barrier before LDS write, barrier before LDS read +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_4_stages_ii_1_f16_less_iterations_2(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -699,6 +866,26 @@ func.func @rock_nopipeline(%input : memref<16xi8, #gpu.address_space>, % // The three-way rotation S2,S3,S4 -> S4,S3,S2 avoids private multi-buffering // for regB and regC. // REMOVE-STAGES-LABEL: rock_pipeline_5_stages_three_way_swap +// REMOVE-STAGES-NOT: rock.stage +// Prologue: barrier before LDS write (twice for 2-deep prologue) +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // Barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space + // REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: } +// Epilogue: barrier before LDS write, barrier before LDS read +// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space> +// REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return // CHECK-LABEL: rock_pipeline_5_stages_three_way_swap func.func @rock_pipeline_5_stages_three_way_swap(%input : memref<16xi8, #gpu.address_space>, %output : memref<16xi8, #gpu.address_space>){ %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Rock/test_rock_pipeline_nested.mlir b/mlir/test/Dialect/Rock/test_rock_pipeline_nested.mlir index 7fdfd334f200..f3a463094aac 100644 --- a/mlir/test/Dialect/Rock/test_rock_pipeline_nested.mlir +++ b/mlir/test/Dialect/Rock/test_rock_pipeline_nested.mlir @@ -2,6 +2,23 @@ // RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=true" | FileCheck %s --check-prefix=REMOVE-STAGES // REMOVE-STAGES-LABEL: rock_nopipeline +// No pipeline attribute - no barriers expected +// REMOVE-STAGES: rock.alloc() : memref<32xi8, #gpu.address_space> +// REMOVE-STAGES: rock.alloc() : memref<32xi8, #gpu.address_space> +// REMOVE-STAGES-NOT: rock.alloc() : memref<32xi8, #gpu.address_space> +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // REMOVE-STAGES-NOT: rock.lds_barrier + // REMOVE-STAGES-NOT: rock.extract_multibuffer + // REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // REMOVE-STAGES-NOT: rock.lds_barrier + // REMOVE-STAGES: scf.for + // REMOVE-STAGES-NOT: rock.stage + // REMOVE-STAGES-NOT: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_nopipeline(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -18,18 +35,6 @@ func.func @rock_nopipeline(%input : memref<16xf16, #gpu.address_space>, %lds0 = memref.view %rawLds0[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> %lds1 = memref.view %rawLds1[%c0][] : memref<32xi8, #gpu.address_space> to memref<16xf16, #gpu.address_space> - // REMOVE-STAGES: rock.alloc() : memref<32xi8, #gpu.address_space> - // REMOVE-STAGES: rock.alloc() : memref<32xi8, #gpu.address_space> - // REMOVE-STAGES-NOT: rock.alloc() : memref<32xi8, #gpu.address_space> - - // REMOVE-STAGES-NOT: rock.stage - // REMOVE-STAGES: scf.for - // REMOVE-STAGES-NOT: rock.stage - // REMOVE-STAGES-NOT: rock.extract_multibuffer - // REMOVE-STAGES: scf.for - // REMOVE-STAGES-NOT: rock.stage - // REMOVE-STAGES: scf.for - // REMOVE-STAGES-NOT: rock.stage scf.for %idx = %c0 to %c4 step %c1 { scf.for %arg3 = %c0 to %c3 step %c1 { rock.stage { @@ -88,6 +93,25 @@ func.func @rock_nopipeline(%input : memref<16xf16, #gpu.address_space>, // one loop inside another loop, inner loop has pipeline<1> attribute // CHECK-LABEL: rock_pipeline_oneloop +// REMOVE-STAGES-LABEL: rock_pipeline_oneloop +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // Prologue: barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS write + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES-NEXT: rock.lds_barrier + // REMOVE-STAGES: } + // Epilogue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // Epilogue: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: return func.func @rock_pipeline_oneloop(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>){ %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -175,6 +199,40 @@ func.func @rock_pipeline_oneloop(%input : memref<16xf16, #gpu.address_space attribute // CHECK-LABEL: rock_pipeline_twoloops +// REMOVE-STAGES-LABEL: rock_pipeline_twoloops +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // First inner pipelined loop + // Prologue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: } + // Epilogue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // Epilogue: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // Second inner pipelined loop + // Prologue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: } + // Epilogue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // Epilogue: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: return func.func @rock_pipeline_twoloops(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -323,6 +381,40 @@ func.func @rock_pipeline_twoloops(%input : memref<16xf16, #gpu.address_space attribute // CHECK-LABEL: rock_pipeline_twoloops_ii2 +// REMOVE-STAGES-LABEL: rock_pipeline_twoloops_ii2 +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // First inner pipelined loop (ii=2) + // Prologue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // Loop body: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: } + // Epilogue: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // Second inner pipelined loop (ii=2) + // Prologue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> + // Loop body: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // REMOVE-STAGES: } + // Epilogue: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: return func.func @rock_pipeline_twoloops_ii2(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -444,6 +536,31 @@ func.func @rock_pipeline_twoloops_ii2(%input : memref<16xf16, #gpu.address_space // two loops inside an outer loop (which is inside another outer loop), inner loops have pipeline<1> attribute // CHECK-LABEL: rock_pipeline_twoloops_triplenested +// REMOVE-STAGES-LABEL: rock_pipeline_twoloops_triplenested +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // REMOVE-STAGES: scf.for + // First inner pipelined loop + // Prologue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: } + // Epilogue barriers + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier + // Second inner pipelined loop + // Prologue: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: } + // Epilogue barriers + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_twoloops_triplenested(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -594,6 +711,41 @@ func.func @rock_pipeline_twoloops_triplenested(%input : memref<16xf16, #gpu.addr // two outer loops that each contain two inner loops, inner loops have pipeline<1> attribute // CHECK-LABEL: rock_pipeline_twoloops_twoouterloops +// REMOVE-STAGES-LABEL: rock_pipeline_twoloops_twoouterloops +// REMOVE-STAGES-NOT: rock.stage +// First outer loop +// REMOVE-STAGES: scf.for + // First inner pipelined loop + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: scf.for + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: } + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier + // Second inner pipelined loop + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: scf.for + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: } + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier +// Second outer loop +// REMOVE-STAGES: scf.for + // First inner pipelined loop + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: scf.for + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: } + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier + // Second inner pipelined loop + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: scf.for + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: } + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_twoloops_twoouterloops(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -867,6 +1019,18 @@ func.func @rock_pipeline_twoloops_twoouterloops(%input : memref<16xf16, #gpu.add // one loop inside a loop, inner loops have pipeline<1> attribute and no rock.stages // CHECK-LABEL: rock_pipeline_nestednostages +// REMOVE-STAGES-LABEL: rock_pipeline_nestednostages +// No stages to pipeline - but barriers are preserved when stages are empty +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // REMOVE-STAGES: scf.for + // Barriers preserved: barrier before LDS write + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space> + // Barriers preserved: barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space> +// REMOVE-STAGES: return func.func @rock_pipeline_nestednostages(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -916,6 +1080,20 @@ func.func @rock_pipeline_nestednostages(%input : memref<16xf16, #gpu.address_spa // two loops inside an outer loop, inner loops have pipeline<2> attribute and two rock.stages // CHECK-LABEL: rock_pipeline_twoloops_ii_equal_numstages +// REMOVE-STAGES-LABEL: rock_pipeline_twoloops_ii_equal_numstages +// REMOVE-STAGES-NOT: rock.stage +// REMOVE-STAGES: scf.for + // First inner pipelined loop (ii=2, 2 stages - no prologue/epilogue) + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS write, barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier + // Second inner pipelined loop (ii=2, 2 stages - no prologue/epilogue) + // REMOVE-STAGES: scf.for + // Loop body: barrier before LDS write, barrier before LDS read + // REMOVE-STAGES: rock.lds_barrier + // REMOVE-STAGES: rock.lds_barrier +// REMOVE-STAGES: return func.func @rock_pipeline_twoloops_ii_equal_numstages(%input : memref<16xf16, #gpu.address_space>, %output : memref<16xf16, #gpu.address_space>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index From 7faecdcb8fe48b79556b17a074e949ff6d887a0e Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Tue, 24 Feb 2026 15:21:22 +0000 Subject: [PATCH 2/3] Remove debug prints --- mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp index 84a40b231c6a..bc3abe13ef26 100644 --- a/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp +++ b/mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp @@ -770,8 +770,6 @@ void RockPipeline::runOnOperation() { // barriers for registers or globals placeBarriers(rewriter, loc, forOp, stages, multiAllocs, extendedStages, ii, numIterations); - LLVM_DEBUG(DBGS() << "ForOp: " << forOp << "\n"); - ScheduleType schedule; // use all "resources" to generate dependency graph and generate schedule createSchedule(extendedStages, resources, ii, schedule, @@ -803,12 +801,6 @@ void RockPipeline::runOnOperation() { } } } - LLVM_DEBUG({ - DBGS() << "After remulti-buffer\n"; - DBGS() << "===============\n"; - DBGS() << "Operation: " << getOperation() << "\n"; - DBGS() << "===============\n"; - }); // Cleanup the stages { From 9e4136bd514afef4b0d24de366c8d636df9c5bd8 Mon Sep 17 00:00:00 2001 From: Umang Yadav <29876643+umangyadav@users.noreply.github.com> Date: Tue, 24 Feb 2026 15:42:32 -0500 Subject: [PATCH 3/3] Update mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir b/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir index 93d7bfce7506..aa6435ecb127 100644 --- a/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir +++ b/mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir @@ -66,10 +66,10 @@ module { // CHECK: arith.addf // CHECK: memref.store {{.*}}[%[[INNER_IV]]] // CHECK: } - // CHECK: %[[ALLOC_LDS_A:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> - // CHECK: %[[ALLOC_LDS_B:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> - // CHECK: %[[WID_LDS:.*]] = rock.workitem_id : index - // CHECK: memref.load %[[ALLOC_LDS_A]][%c0] + // CHECK: %[[ALLOC_PRIV_A:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> + // CHECK: %[[ALLOC_PRIV_B:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space> + // CHECK: %[[WID_PRIV:.*]] = rock.workitem_id : index + // CHECK: memref.load %[[ALLOC_PRIV_A]][%c0] // CHECK-NEXT: rock.lds_barrier affine.for %arg5 = 0 to 16 { %4 = memref.load %1[%arg5] : memref<64xf16, #gpu.address_space>