Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 17 additions & 25 deletions mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,9 @@ struct RemoveBackToBackBarriersRewritePattern

LogicalResult matchAndRewrite(rock::LDSBarrierOp op,
PatternRewriter &rw) const override {
if (dyn_cast_or_null<rock::LDSBarrierOp>(op->getNextNode())) {
op->getNextNode()->erase();
if (auto nextBarrier =
dyn_cast_or_null<rock::LDSBarrierOp>(op->getNextNode())) {
rw.eraseOp(nextBarrier);
return success();
}
return failure();
Expand All @@ -162,7 +163,12 @@ struct PushBarrierDownRewritePattern
return failure();

// Don't go over the terminator
if (!nextOp->getNextNode())
if (nextOp->hasTrait<OpTrait::IsTerminator>() ||
nextOp->hasTrait<OpTrait::ReturnLike>())
return failure();

// Don't push past another barrier - let RemoveBackToBackBarriers handle it
if (isa<rock::LDSBarrierOp>(nextOp))
return failure();

// We assume that operations that have a body may modify LDS
Expand Down Expand Up @@ -733,14 +739,14 @@ void RockPipeline::runOnOperation() {

forOp.walk([&](rock::StageOp stageOp) { stages.push_back(stageOp); });

if (stages.empty())
continue;

forOp.walk([](rock::LDSBarrierOp barrier) {
if (!barrier->getParentOfType<rock::StageOp>())
barrier->erase();
});

if (stages.empty())
continue;

LLVM_DEBUG(DBGS() << "Number of stages: " << stages.size() << "\n");
LLVM_DEBUG(DBGS() << "Initiation Interval: " << ii << "\n");
size_t numStages = stages.size();
Expand All @@ -764,7 +770,6 @@ void RockPipeline::runOnOperation() {
// barriers for registers or globals
placeBarriers(rewriter, loc, forOp, stages, multiAllocs, extendedStages,
ii, numIterations);

ScheduleType schedule;
// use all "resources" to generate dependency graph and generate schedule
createSchedule(extendedStages, resources, ii, schedule,
Expand Down Expand Up @@ -800,26 +805,13 @@ void RockPipeline::runOnOperation() {
// Cleanup the stages
{
if (removeStages) {
RewritePatternSet patternsPushBarrier(&getContext());
// run PushBarrierDownRewritePattern before RemoveStagesRewritePattern,
// because the latter will remove the stages and their terminators
patternsPushBarrier.add<PushBarrierDownRewritePattern>(ctx);
if (failed(applyPatternsGreedily(func, std::move(patternsPushBarrier))))
return signalPassFailure();

// run RemoveStagesRewritePattern before
// RemoveBackToBackBarriersRewritePattern, because the latter expects to
// find no stages
RewritePatternSet patternsRemoveStages(&getContext());
patternsRemoveStages.add<RemoveStagesRewritePattern>(ctx);
RewritePatternSet patterns(&getContext());
patterns.add<RemoveStagesRewritePattern, PushBarrierDownRewritePattern,
RemoveBackToBackBarriersRewritePattern>(&getContext());
if (failed(
applyPatternsGreedily(func, std::move(patternsRemoveStages))))
return signalPassFailure();

RewritePatternSet patternsBackToBack(&getContext());
patternsBackToBack.add<RemoveBackToBackBarriersRewritePattern>(ctx);
if (failed(applyPatternsGreedily(func, std::move(patternsBackToBack))))
applyPatternsGreedily(getOperation(), std::move(patterns)))) {
return signalPassFailure();
Comment on lines +808 to 813
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the remove-stages cleanup, applyPatternsGreedily() is now given a single pattern set containing RemoveStagesRewritePattern, PushBarrierDownRewritePattern, and RemoveBackToBackBarriersRewritePattern. Greedy rewrites don't guarantee an explicit phase ordering between patterns, so this makes correct barrier placement depend on the driver’s worklist behavior. To keep the behavior deterministic (and aligned with the intent to push barriers only after stages/terminators are removed), consider running these pattern applications in separate applyPatternsGreedily calls (e.g., RemoveStages first, then PushBarrierDown, then back-to-back cleanup) or assign benefits to enforce the intended ordering.

Copilot uses AI. Check for mistakes.
}
}
}
}
Expand Down
12 changes: 5 additions & 7 deletions mlir/test/Dialect/Rock/rock-pipeline-early-exit.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ module {
// CHECK: arith.addf
// CHECK: memref.store {{.*}}[%[[INNER_IV]]]
// CHECK: }
// CHECK: %[[ALLOC_PRIV_A:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space<private>>
// CHECK: %[[ALLOC_PRIV_B:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space<private>>
// CHECK: %[[WID_PRIV:.*]] = rock.workitem_id : index
// CHECK: memref.load %[[ALLOC_PRIV_A]][%c0]
// CHECK-NEXT: rock.lds_barrier
affine.for %arg5 = 0 to 16 {
%4 = memref.load %1[%arg5] : memref<64xf16, #gpu.address_space<workgroup>>
Expand All @@ -82,13 +86,7 @@ module {
rock.lds_barrier
} {pipeline = #rock.pipeline<2>}

// CHECK: %[[ALLOC_G:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space<private>>
// CHECK: %[[ALLOC_H:.*]] = rock.alloc() : memref<16xf16, #gpu.address_space<private>>
// CHECK: %[[WID4:.*]] = rock.workitem_id : index
// CHECK: memref.load %[[ALLOC_G]][%c0]
// CHECK: memref.store {{.*}}, {{.*}}[%[[WID4]]]
// CHECK: memref.load %[[ALLOC_H]][%c0]
// CHECK: memref.store {{.*}}, {{.*}}[%[[WID4]]]
// CHECK: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<64xf16, #gpu.address_space<workgroup>>
// CHECK: }
// CHECK-NOT: {pipeline = #rock.pipeline<2>}

Expand Down
187 changes: 187 additions & 0 deletions mlir/test/Dialect/Rock/test_rock_pipeline.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@
// RUN: rocmlir-opt %s --rock-pipeline="rock-pipeline-remove-stages=true" | FileCheck %s --check-prefix=REMOVE-STAGES

// CHECK-LABEL: rock_pipeline_3_stages_ii_1
// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_1
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_BUF:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_BUF]]{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_3_stages_ii_1(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -60,6 +71,22 @@ func.func @rock_pipeline_3_stages_ii_1(%input : memref<16xi8, #gpu.address_space
}

// CHECK-LABEL: rock_pipeline_3_stages_ii_2
// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_2
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS read
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]]
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_3_stages_ii_2(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -119,6 +146,19 @@ func.func @rock_pipeline_3_stages_ii_2(%input : memref<16xi8, #gpu.address_space

// This test shouldn't pipeline the loop, but it would still add barriers and multibuffer by 1.
// CHECK-LABEL: rock_pipeline_3_stages_ii_2_less_iterations
// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_2_less_iterations
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS read
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]]
// REMOVE-STAGES: }
// Epilogue: barrier before LDS read
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_3_stages_ii_2_less_iterations(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -170,6 +210,23 @@ func.func @rock_pipeline_3_stages_ii_2_less_iterations(%input : memref<16xi8, #g
}

// CHECK-LABEL: rock_pipeline_3_stages_ii_3
// REMOVE-STAGES-LABEL: rock_pipeline_3_stages_ii_3
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// Barrier before LDS read
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]]
// REMOVE-STAGES: }
// No barrier after loop for this function - all barriers are inside the loop
// REMOVE-STAGES-NOT: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_3_stages_ii_3(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -223,6 +280,14 @@ func.func @rock_pipeline_3_stages_ii_3(%input : memref<16xi8, #gpu.address_space

// This test shouldn't do any pipelining as it doesn't have any stages but it should still multibuffer by 1
// CHECK-LABEL: rock_pipeline_no_stages_ii_1
// REMOVE-STAGES-LABEL: rock_pipeline_no_stages_ii_1
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES-NOT: rock.lds_barrier
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES-NOT: rock.lds_barrier
// REMOVE-STAGES-NOT: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_no_stages_ii_1(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -266,6 +331,28 @@ func.func @rock_pipeline_no_stages_ii_1(%input : memref<16xi8, #gpu.address_spac
}

// CHECK-LABEL: rock_pipeline_4_stages_ii_2
// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_2
// REMOVE-STAGES-NOT: rock.stage
// Prologue: barrier before LDS read
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.load %[[LDS_PRO]]
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<global>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// Barrier before LDS read
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]]
// REMOVE-STAGES: }
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_4_stages_ii_2(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -333,6 +420,27 @@ func.func @rock_pipeline_4_stages_ii_2(%input : memref<16xi8, #gpu.address_space
}

// CHECK-LABEL: rock_pipeline_4_stages_ii_1_i8
// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_i8
// REMOVE-STAGES-NOT: rock.stage
// Prologue: barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_PRO]]{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// REMOVE-STAGES: }
// Epilogue: barrier before LDS write, barrier before LDS read
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_4_stages_ii_1_i8(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -413,6 +521,27 @@ func.func @rock_pipeline_4_stages_ii_1_i8(%input : memref<16xi8, #gpu.address_sp
}

// CHECK-LABEL: rock_pipeline_4_stages_ii_1_f16
// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_f16
// REMOVE-STAGES-NOT: rock.stage
// Prologue: barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_PRO]]{{.*}} : memref<16xf16, #gpu.address_space<workgroup>>
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xf16, #gpu.address_space<workgroup>>
// REMOVE-STAGES: }
// Epilogue: barrier before LDS write, barrier before LDS read
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_4_stages_ii_1_f16(%input : memref<16xf16, #gpu.address_space<global>>, %output : memref<16xf16, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -489,6 +618,23 @@ func.func @rock_pipeline_4_stages_ii_1_f16(%input : memref<16xf16, #gpu.address_

// This test should adjust II to 2 to enable loop pipelining
// CHECK-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations
// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations
// REMOVE-STAGES-NOT: rock.stage
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS read
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_RD:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.load %[[LDS_RD]]
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xf16, #gpu.address_space<workgroup>>
// REMOVE-STAGES: }
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_4_stages_ii_1_f16_less_iterations(%input : memref<16xf16, #gpu.address_space<global>>, %output : memref<16xf16, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -557,6 +703,27 @@ func.func @rock_pipeline_4_stages_ii_1_f16_less_iterations(%input : memref<16xf1

// This test should do loop pipelining without adjusting II, but note that it emits an scf.for loop with zero iterations.
// CHECK-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations_2
// REMOVE-STAGES-LABEL: rock_pipeline_4_stages_ii_1_f16_less_iterations_2
// REMOVE-STAGES-NOT: rock.stage
// Prologue: barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_PRO:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_PRO]]{{.*}} : memref<16xf16, #gpu.address_space<workgroup>>
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xf16, #gpu.address_space<workgroup>>
// REMOVE-STAGES: }
// Epilogue: barrier before LDS write, barrier before LDS read
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xf16, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: return
func.func @rock_pipeline_4_stages_ii_1_f16_less_iterations_2(%input : memref<16xf16, #gpu.address_space<global>>, %output : memref<16xf16, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
Expand Down Expand Up @@ -699,6 +866,26 @@ func.func @rock_nopipeline(%input : memref<16xi8, #gpu.address_space<global>>, %
// The three-way rotation S2,S3,S4 -> S4,S3,S2 avoids private multi-buffering
// for regB and regC.
// REMOVE-STAGES-LABEL: rock_pipeline_5_stages_three_way_swap
// REMOVE-STAGES-NOT: rock.stage
// Prologue: barrier before LDS write (twice for 2-deep prologue)
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: scf.for
// REMOVE-STAGES-NOT: rock.stage
// Barrier before LDS write
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES-NEXT: rock.lds_barrier
// REMOVE-STAGES-NEXT: %[[LDS_WR:.*]] = rock.extract_multibuffer{{.*}}#gpu.address_space<workgroup>
// REMOVE-STAGES-NEXT: memref.store %{{.*}}, %[[LDS_WR]]{{.*}} : memref<16xi8, #gpu.address_space<workgroup>>
// REMOVE-STAGES: }
// Epilogue: barrier before LDS write, barrier before LDS read
// REMOVE-STAGES: memref.load %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: memref.store %{{.*}} : memref<16xi8, #gpu.address_space<private>>
// REMOVE-STAGES: rock.lds_barrier
// REMOVE-STAGES: return
// CHECK-LABEL: rock_pipeline_5_stages_three_way_swap
func.func @rock_pipeline_5_stages_three_way_swap(%input : memref<16xi8, #gpu.address_space<global>>, %output : memref<16xi8, #gpu.address_space<global>>){
%c0 = arith.constant 0 : index
Expand Down
Loading