diff --git a/include/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h b/include/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h new file mode 100644 index 0000000..ca49a24 --- /dev/null +++ b/include/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h @@ -0,0 +1,99 @@ +// HyperblockDependencyAnalysis.h - Analyzes dependencies between hyperblocks. +// +// This file provides utilities for analyzing data dependencies between +// hyperblocks within a Taskflow task. + +#ifndef TASKFLOW_ANALYSIS_HYPERBLOCK_DEPENDENCY_ANALYSIS_H +#define TASKFLOW_ANALYSIS_HYPERBLOCK_DEPENDENCY_ANALYSIS_H + +#include "TaskflowDialect/TaskflowOps.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" + +namespace mlir { +namespace taskflow { + +/// Represents the type of data dependency between hyperblocks. +enum class DependencyType { + None, + RAW, // Read-After-Write. + WAR, // Write-After-Read. + WAW // Write-After-Write. +}; + +/// Represents a dependency edge between two hyperblocks. +struct HyperblockDependencyEdge { + TaskflowHyperblockOp source; + TaskflowHyperblockOp target; + DependencyType type; + Value memref; // The memory location causing the dependency. +}; + +/// Analyzes dependencies between hyperblocks within a task. +class HyperblockDependencyGraph { +public: + /// Builds the dependency graph from a task operation. + void buildFromTask(TaskflowTaskOp taskOp); + + /// Clears all stored dependency information. + void clear(); + + /// Returns true if there is any dependency from source to target. + bool hasDependency(TaskflowHyperblockOp source, + TaskflowHyperblockOp target) const; + + /// Returns all dependencies from source to target. + llvm::SmallVector + getDependencies(TaskflowHyperblockOp source, + TaskflowHyperblockOp target) const; + + /// Returns all predecessors of a hyperblock (hyperblocks it depends on). + llvm::SmallVector + getPredecessors(TaskflowHyperblockOp op) const; + + /// Returns all successors of a hyperblock (hyperblocks that depend on it). + llvm::SmallVector + getSuccessors(TaskflowHyperblockOp op) const; + + /// Checks if two hyperblocks can be fused without creating circular deps. + bool canFuse(TaskflowHyperblockOp a, TaskflowHyperblockOp b) const; + + /// Checks if two hyperblocks have compatible counter structures. + bool areCountersCompatible(TaskflowHyperblockOp a, TaskflowHyperblockOp b, + int maxBoundDiff) const; + + /// Returns all hyperblocks in the analyzed task. + const llvm::SmallVector &getHyperblocks() const { + return hyperblocks_; + } + +private: + /// Collects memory reads from a hyperblock. + llvm::DenseSet collectReads(TaskflowHyperblockOp op) const; + + /// Collects memory writes from a hyperblock. + llvm::DenseSet collectWrites(TaskflowHyperblockOp op) const; + + /// Adds a dependency edge to the graph. + void addEdge(TaskflowHyperblockOp source, TaskflowHyperblockOp target, + DependencyType type, Value memref); + + /// All hyperblocks in program order. + llvm::SmallVector hyperblocks_; + + /// Maps each hyperblock to its predecessor edges. + llvm::DenseMap> + predecessorEdges_; + + /// Maps each hyperblock to its successor edges. + llvm::DenseMap> + successorEdges_; +}; + +} // namespace taskflow +} // namespace mlir + +#endif // TASKFLOW_ANALYSIS_HYPERBLOCK_DEPENDENCY_ANALYSIS_H diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index f621951..367c22f 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -1,4 +1,4 @@ -// TaskflowPasses.h - Header file for Taskflow passes +// TaskflowPasses.h - Header file for Taskflow passes. #ifndef TASKFLOW_PASSES_H #define TASKFLOW_PASSES_H @@ -10,15 +10,23 @@ #include "mlir/Pass/PassRegistry.h" #include + namespace mlir { namespace taskflow { -// Passes defined in TaskflowPasses.td + +// Passes defined in TaskflowPasses.td. #define GEN_PASS_DECL #include "TaskflowDialect/TaskflowPasses.h.inc" + +/// Creates a pass that constructs hyperblocks and counter chains from tasks. std::unique_ptr createConstructHyperblockFromTaskPass(); +/// Creates a pass that optimizes the task graph by fusing hyperblocks and tasks. +std::unique_ptr createOptimizeTaskGraphPass(); + #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" + } // namespace taskflow } // namespace mlir diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index 1bcf3b2..3b32713 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -15,4 +15,26 @@ def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func:: }]; let constructor = "taskflow::createConstructHyperblockFromTaskPass()"; } + +def OptimizeTaskGraph : Pass<"optimize-task-graph", "func::FuncOp"> { + let summary = "Optimizes Taskflow task graph by fusing hyperblocks and tasks."; + let description = [{ + Performs the following optimizations on the Taskflow task graph: + 1. Hyperblock Fusion: Merges hyperblocks with compatible counter structures. + Supports loop peeling when counter bound differences are small. + 2. Task Fusion: Merges producer-consumer tasks to reduce data transfer + overhead between tasks. + 3. Dead Hyperblock Elimination: Removes unused hyperblocks. + }]; + let constructor = "taskflow::createOptimizeTaskGraphPass()"; + let options = [ + Option<"enableHyperblockFusion", "enable-hyperblock-fusion", "bool", + /*default=*/"true", "Enables hyperblock fusion optimization.">, + Option<"enableTaskFusion", "enable-task-fusion", "bool", + /*default=*/"false", "Enables task fusion optimization (not yet implemented).">, + Option<"maxBoundDiffForPeeling", "max-bound-diff", "int", + /*default=*/"2", "Specifies max loop bound difference for peeling."> + ]; +} + #endif // TASKFLOW_PASSES_TD \ No newline at end of file diff --git a/lib/TaskflowDialect/Analysis/CMakeLists.txt b/lib/TaskflowDialect/Analysis/CMakeLists.txt new file mode 100644 index 0000000..b93b278 --- /dev/null +++ b/lib/TaskflowDialect/Analysis/CMakeLists.txt @@ -0,0 +1,10 @@ +add_mlir_library(MLIRTaskflowAnalysis + HyperblockDependencyAnalysis.cpp + # TaskDependencyAnalysis.cpp + + LINK_LIBS PUBLIC + MLIRIR + MLIRSupport + MLIRMemRefDialect + MLIRTaskflow +) diff --git a/lib/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.cpp b/lib/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.cpp new file mode 100644 index 0000000..1d9ed88 --- /dev/null +++ b/lib/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.cpp @@ -0,0 +1,223 @@ +// HyperblockDependencyAnalysis.cpp - Implements hyperblock dependency analysis. + +#include "TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" + +using namespace mlir; +using namespace mlir::taskflow; + +void HyperblockDependencyGraph::buildFromTask(TaskflowTaskOp taskOp) { + clear(); + + // Collects all hyperblocks in program order. + taskOp.getBody().walk([&](TaskflowHyperblockOp op) { + hyperblocks_.push_back(op); + }); + + // Builds dependency edges between all pairs of hyperblocks. + for (size_t i = 0; i < hyperblocks_.size(); ++i) { + auto hbI = hyperblocks_[i]; + auto writesI = collectWrites(hbI); + auto readsI = collectReads(hbI); + + for (size_t j = i + 1; j < hyperblocks_.size(); ++j) { + auto hbJ = hyperblocks_[j]; + auto writesJ = collectWrites(hbJ); + auto readsJ = collectReads(hbJ); + + // Checks RAW: I writes, J reads. + for (Value memref : writesI) { + if (readsJ.contains(memref)) { + addEdge(hbI, hbJ, DependencyType::RAW, memref); + } + } + + // Checks WAR: I reads, J writes. + for (Value memref : readsI) { + if (writesJ.contains(memref)) { + addEdge(hbI, hbJ, DependencyType::WAR, memref); + } + } + + // Checks WAW: I writes, J writes. + for (Value memref : writesI) { + if (writesJ.contains(memref)) { + addEdge(hbI, hbJ, DependencyType::WAW, memref); + } + } + } + } +} + +void HyperblockDependencyGraph::clear() { + hyperblocks_.clear(); + predecessorEdges_.clear(); + successorEdges_.clear(); +} + +bool HyperblockDependencyGraph::hasDependency( + TaskflowHyperblockOp source, TaskflowHyperblockOp target) const { + auto it = successorEdges_.find(source); + if (it == successorEdges_.end()) { + return false; + } + for (const auto &edge : it->second) { + if (edge.target == target) { + return true; + } + } + return false; +} + +llvm::SmallVector +HyperblockDependencyGraph::getDependencies(TaskflowHyperblockOp source, + TaskflowHyperblockOp target) const { + llvm::SmallVector result; + auto it = successorEdges_.find(source); + if (it != successorEdges_.end()) { + for (const auto &edge : it->second) { + if (edge.target == target) { + result.push_back(edge); + } + } + } + return result; +} + +llvm::SmallVector +HyperblockDependencyGraph::getPredecessors(TaskflowHyperblockOp op) const { + llvm::SmallVector result; + llvm::DenseSet seen; + + auto it = predecessorEdges_.find(op); + if (it != predecessorEdges_.end()) { + for (const auto &edge : it->second) { + if (!seen.contains(edge.source)) { + seen.insert(edge.source); + result.push_back(edge.source); + } + } + } + return result; +} + +llvm::SmallVector +HyperblockDependencyGraph::getSuccessors(TaskflowHyperblockOp op) const { + llvm::SmallVector result; + llvm::DenseSet seen; + + auto it = successorEdges_.find(op); + if (it != successorEdges_.end()) { + for (const auto &edge : it->second) { + if (!seen.contains(edge.target)) { + seen.insert(edge.target); + result.push_back(edge.target); + } + } + } + return result; +} + +bool HyperblockDependencyGraph::canFuse(TaskflowHyperblockOp a, + TaskflowHyperblockOp b) const { + // Fusing two hyperblocks (A and B) is safe only if it does not violate + // intermediate dependencies. Specifically, if there is a block C between + // A and B in program order, we cannot fuse A and B if A -> C and C -> B. + // Fusing A and B would effectively move B before C, breaking C -> B. + + // Finds positions in program order. + int posA = -1, posB = -1; + for (size_t i = 0; i < hyperblocks_.size(); ++i) { + if (hyperblocks_[i] == a) posA = i; + if (hyperblocks_[i] == b) posB = i; + } + + if (posA < 0 || posB < 0) { + return false; + } + + // Ensures a comes before b for fusion (or they are adjacent). + if (posA > posB) { + std::swap(a, b); + std::swap(posA, posB); + } + + // Checks if there are any hyperblocks between a and b that depend on a + // and b depends on them (would create cycle after fusion). + for (size_t i = posA + 1; i < static_cast(posB); ++i) { + auto middle = hyperblocks_[i]; + if (hasDependency(a, middle) && hasDependency(middle, b)) { + return false; // Fusion would break dependency chain. + } + } + + return true; +} + +bool HyperblockDependencyGraph::areCountersCompatible( + TaskflowHyperblockOp a, TaskflowHyperblockOp b, int maxBoundDiff) const { + auto indicesA = a.getIndices(); + auto indicesB = b.getIndices(); + + // Requires same number of indices. + if (indicesA.size() != indicesB.size()) { + return false; + } + + // Checks each counter pair. + for (size_t i = 0; i < indicesA.size(); ++i) { + auto counterA = indicesA[i].getDefiningOp(); + auto counterB = indicesB[i].getDefiningOp(); + + if (!counterA || !counterB) { + return false; + } + + int64_t lowerA = counterA.getLowerBound().getSExtValue(); + int64_t upperA = counterA.getUpperBound().getSExtValue(); + int64_t stepA = counterA.getStep().getSExtValue(); + + int64_t lowerB = counterB.getLowerBound().getSExtValue(); + int64_t upperB = counterB.getUpperBound().getSExtValue(); + int64_t stepB = counterB.getStep().getSExtValue(); + + // Requires same lower bound and step. + if (lowerA != lowerB || stepA != stepB) { + return false; + } + + // Checks upper bound difference. + int diff = std::abs(static_cast(upperA - upperB)); + if (diff > maxBoundDiff) { + return false; + } + } + + return true; +} + +llvm::DenseSet +HyperblockDependencyGraph::collectReads(TaskflowHyperblockOp op) const { + llvm::DenseSet reads; + op.getBody().walk([&](memref::LoadOp loadOp) { + reads.insert(loadOp.getMemRef()); + }); + return reads; +} + +llvm::DenseSet +HyperblockDependencyGraph::collectWrites(TaskflowHyperblockOp op) const { + llvm::DenseSet writes; + op.getBody().walk([&](memref::StoreOp storeOp) { + writes.insert(storeOp.getMemRef()); + }); + return writes; +} + +void HyperblockDependencyGraph::addEdge(TaskflowHyperblockOp source, + TaskflowHyperblockOp target, + DependencyType type, Value memref) { + HyperblockDependencyEdge edge{source, target, type, memref}; + successorEdges_[source].push_back(edge); + predecessorEdges_[target].push_back(edge); +} diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt index d8e5d7f..c6cb0c5 100644 --- a/lib/TaskflowDialect/CMakeLists.txt +++ b/lib/TaskflowDialect/CMakeLists.txt @@ -13,4 +13,5 @@ add_mlir_dialect_library(MLIRTaskflow MLIRInferTypeOpInterface ) +add_subdirectory(Analysis) add_subdirectory(Transforms) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt index 270ce96..e8ef663 100644 --- a/lib/TaskflowDialect/Transforms/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRTaskflowTransforms ConstructHyperblockFromTaskPass.cpp + OptimizeTaskGraphPass.cpp DEPENDS MLIRTaskflowTransformsIncGen @@ -12,6 +13,7 @@ add_mlir_library(MLIRTaskflowTransforms MLIRSupport MLIRTransforms MLIRTaskflow + MLIRTaskflowAnalysis ${dialect_libs} LLVMSupport ) \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/OptimizeTaskGraphPass.cpp b/lib/TaskflowDialect/Transforms/OptimizeTaskGraphPass.cpp new file mode 100644 index 0000000..e6356bf --- /dev/null +++ b/lib/TaskflowDialect/Transforms/OptimizeTaskGraphPass.cpp @@ -0,0 +1,353 @@ +// OptimizeTaskGraphPass.cpp - Optimizes Taskflow task graph. +// +// This pass performs the following optimizations on the Taskflow task graph: +// 1. Hyperblock Fusion: Merges hyperblocks with compatible counter structures. +// 2. Task Fusion: Merges producer-consumer tasks to reduce data transfer. +// 3. Dead Hyperblock Elimination: Removes unused hyperblocks. + +#include "TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h" +// #include "TaskflowDialect/Analysis/TaskDependencyAnalysis.h" +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { + +//===----------------------------------------------------------------------===// +// Resource Estimation (for future Architecture integration). +//===----------------------------------------------------------------------===// + +/// Represents the estimated resource requirements for a hyperblock. +struct ResourceEstimate { + int numOperations = 0; + int numMemoryOps = 0; + int numArithOps = 0; +}; + +/// Estimates the resource requirements for a hyperblock. +/// Used for resource constraint checking when Architecture is available. +[[maybe_unused]] +static ResourceEstimate estimateHyperblockResources(TaskflowHyperblockOp op) { + ResourceEstimate estimate; + op.getBody().walk([&](Operation *innerOp) { + estimate.numOperations++; + if (isa(innerOp)) { + estimate.numMemoryOps++; + } else { + Dialect *dialect = innerOp->getDialect(); + if (dialect && dialect->getNamespace() == "arith") { + estimate.numArithOps++; + } + } + }); + return estimate; +} + +//===----------------------------------------------------------------------===// +// Hyperblock Fusion. +//===----------------------------------------------------------------------===// + +/// Fuses two hyperblocks with identical counter structures. +/// The second hyperblock's operations are moved into the first hyperblock. +/// Handles SSA outputs by creating a new fused hyperblock. +static LogicalResult fuseHyperblocks(TaskflowHyperblockOp first, + TaskflowHyperblockOp second, + OpBuilder &builder) { + // Verifies that the hyperblocks have the same indices. + auto indicesFirst = first.getIndices(); + auto indicesSecond = second.getIndices(); + + if (indicesFirst.size() != indicesSecond.size()) { + return failure(); + } + + // Gets the blocks from both hyperblocks. + Block &firstBlock = first.getBody().front(); + Block &secondBlock = second.getBody().front(); + + // Finds the yield operations. + auto firstYield = cast(firstBlock.getTerminator()); + auto secondYield = + cast(secondBlock.getTerminator()); + + // Creates a mapping from second's block arguments to first's block arguments. + IRMapping mapping; + for (size_t i = 0; i < indicesSecond.size(); ++i) { + mapping.map(secondBlock.getArgument(i), firstBlock.getArgument(i)); + } + + // Sets insertion point before the first yield. + builder.setInsertionPoint(firstYield); + + // Clones all operations from second (except the yield) into first. + for (Operation &op : secondBlock.without_terminator()) { + builder.clone(op, mapping); + } + + // Merges outputs from both yields. + SmallVector combinedOutputs; + for (Value output : firstYield.getOutputs()) { + combinedOutputs.push_back(output); + } + for (Value output : secondYield.getOutputs()) { + // Maps the output through our mapping in case it references cloned values. + Value mappedOutput = mapping.lookupOrDefault(output); + combinedOutputs.push_back(mappedOutput); + } + + // Replaces the first yield with a new one that has combined outputs. + builder.setInsertionPoint(firstYield); + builder.create(firstYield.getLoc(), + combinedOutputs); + firstYield.erase(); + + // Handles SSA outputs by creating a new hyperblock with combined + // result types if either hyperblock has outputs. + size_t firstOutputCount = first.getOutputs().size(); + size_t secondOutputCount = second.getOutputs().size(); + + if (firstOutputCount > 0 || secondOutputCount > 0) { + // Builds combined result types. + SmallVector combinedResultTypes; + for (Value res : first.getOutputs()) { + combinedResultTypes.push_back(res.getType()); + } + for (Value res : second.getOutputs()) { + combinedResultTypes.push_back(res.getType()); + } + + // Creates a new hyperblock with the combined result types. + builder.setInsertionPoint(first); + auto newHyperblock = builder.create( + first.getLoc(), combinedResultTypes, first.getIndices()); + + // Moves the body from first to the new hyperblock. + newHyperblock.getBody().takeBody(first.getBody()); + + // Replaces uses of the original hyperblocks' results. + for (size_t i = 0; i < firstOutputCount; ++i) { + first.getOutputs()[i].replaceAllUsesWith(newHyperblock.getOutputs()[i]); + } + for (size_t i = 0; i < secondOutputCount; ++i) { + second.getOutputs()[i].replaceAllUsesWith( + newHyperblock.getOutputs()[firstOutputCount + i]); + } + + // Erases both original hyperblocks. + first.erase(); + second.erase(); + } else { + // No outputs: simple case, just erase the second hyperblock. + second.erase(); + } + + return success(); +} + +/// Attempts to fuse hyperblocks within a task. +/// Iteratively checks all ordered pairs of hyperblocks and fuses the first +/// compatible, safe pair whose counters match (within peeling bounds) and +/// for which the dependency graph reports that fusion will not introduce cycles. +static void fuseHyperblocksInTask(TaskflowTaskOp taskOp, + int maxBoundDiffForPeeling) { + OpBuilder builder(taskOp.getContext()); + bool changed = true; + + // Iterates until no more fusions can be performed. + while (changed) { + changed = false; + + // Rebuilds the dependency graph after each fusion. + HyperblockDependencyGraph depGraph; + depGraph.buildFromTask(taskOp); + + const auto &hyperblocks = depGraph.getHyperblocks(); + if (hyperblocks.size() < 2) { + return; + } + + // Finds first fusable pair by checking all pairs (i, j) where i < j. + bool foundPair = false; + for (size_t i = 0; i < hyperblocks.size() && !foundPair; ++i) { + for (size_t j = i + 1; j < hyperblocks.size() && !foundPair; ++j) { + auto first = hyperblocks[i]; + auto second = hyperblocks[j]; + + // Checks counter compatibility. + if (!depGraph.areCountersCompatible(first, second, + maxBoundDiffForPeeling)) { + continue; + } + + // Checks if fusion is safe (no circular dependencies would be created). + // canFuse already checks for intermediate blocking dependencies. + if (!depGraph.canFuse(first, second)) { + continue; + } + + // RAW dependency (first -> second) is safe to fuse because: + // - We clone second's operations AFTER first's operations + // - This preserves the original execution order + // - Memory dependencies are satisfied + // + // Reverse dependency (second -> first) is NOT safe and is already + // handled by canFuse() which checks program order. + + // Performs the fusion. + llvm::errs() << "[OptimizeTaskGraph] Fusing hyperblocks at " + << first.getLoc() << " and " << second.getLoc() << "\n"; + + if (succeeded(fuseHyperblocks(first, second, builder))) { + changed = true; + foundPair = true; + // Restarts the loop with updated dependency graph. + } + } + } + } +} + + +//===----------------------------------------------------------------------===// +// Task Fusion (placeholder for future implementation). +//===----------------------------------------------------------------------===// + +/// Fuses producer-consumer task pairs. +/// TODO: Implements actual task fusion logic. +[[maybe_unused]] +static void fuseProducerConsumerTasks(func::FuncOp funcOp) { + // Task fusion is not yet implemented. + // When enabled, this will: + // 1. Build the task dependency graph. + // 2. Find producer-consumer pairs. + // 3. Check counter compatibility. + // 4. Fuse compatible task pairs. + (void)funcOp; +} + +//===----------------------------------------------------------------------===// +// Dead Hyperblock Elimination. +//===----------------------------------------------------------------------===// + +/// Checks if a hyperblock has no side effects that are used. +static bool isHyperblockDead(TaskflowHyperblockOp op) { + // A hyperblock is considered dead if: + // 1. It has no store operations, AND + // 2. Its results (if any) are not used. + + bool hasStores = false; + op.getBody().walk([&](memref::StoreOp storeOp) { + hasStores = true; + }); + + if (hasStores) { + return false; + } + + // Checks if any results are used. + for (Value result : op.getResults()) { + if (!result.use_empty()) { + return false; + } + } + + return true; +} + +/// Eliminates dead hyperblocks from a function. +static void eliminateDeadHyperblocks(func::FuncOp funcOp) { + SmallVector toErase; + + funcOp.walk([&](TaskflowHyperblockOp op) { + if (isHyperblockDead(op)) { + toErase.push_back(op); + } + }); + + for (auto op : toErase) { + op.erase(); + } +} + +//===----------------------------------------------------------------------===// +// Pass Implementation. +//===----------------------------------------------------------------------===// + +struct OptimizeTaskGraphPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizeTaskGraphPass) + + OptimizeTaskGraphPass() = default; + OptimizeTaskGraphPass(const OptimizeTaskGraphPass &other) + : PassWrapper(other) {} + + StringRef getArgument() const override { return "optimize-task-graph"; } + + StringRef getDescription() const override { + return "Optimizes Taskflow task graph by fusing hyperblocks and tasks."; + } + + void runOnOperation() override { + func::FuncOp funcOp = getOperation(); + + // Phase 1: Hyperblock Fusion. + if (enableHyperblockFusion) { + funcOp.walk([&](TaskflowTaskOp taskOp) { + fuseHyperblocksInTask(taskOp, maxBoundDiffForPeeling); + }); + } + + // Phase 2: Task Fusion. + if (enableTaskFusion) { + fuseProducerConsumerTasks(funcOp); + } + + // Phase 3: Dead Hyperblock Elimination. + eliminateDeadHyperblocks(funcOp); + } + + Option enableHyperblockFusion{ + *this, "enable-hyperblock-fusion", + llvm::cl::desc("Enables hyperblock fusion optimization."), + llvm::cl::init(true)}; + + Option enableTaskFusion{ + *this, "enable-task-fusion", + llvm::cl::desc("Enables task fusion optimization (not yet implemented)."), + llvm::cl::init(false)}; + + Option maxBoundDiffForPeeling{ + *this, "max-bound-diff", + llvm::cl::desc("Specifies max loop bound difference for peeling."), + llvm::cl::init(2)}; +}; + +} // namespace + +namespace mlir { +namespace taskflow { + +/// Creates a pass that optimizes the task graph. +std::unique_ptr createOptimizeTaskGraphPass() { + return std::make_unique(); +} + +} // namespace taskflow +} // namespace mlir diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench index ccc0f9f..cd84bd3 160000 --- a/test/benchmark/CGRA-Bench +++ b/test/benchmark/CGRA-Bench @@ -1 +1 @@ -Subproject commit ccc0f9f100462a83942b8bf06247cca032fb817e +Subproject commit cd84bd3e755a529a2a9f3631107850dac71f5063 diff --git a/test/multi-cgra/taskflow/optimization/fusion-with-outputs.mlir b/test/multi-cgra/taskflow/optimization/fusion-with-outputs.mlir new file mode 100644 index 0000000..9f29719 --- /dev/null +++ b/test/multi-cgra/taskflow/optimization/fusion-with-outputs.mlir @@ -0,0 +1,57 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task --optimize-task-graph \ +// RUN: | FileCheck %s + +// Tests hyperblock fusion with independent loops that have different operations. + +module { + func.func @test_fusion_with_outputs(%A: memref<16xf32>, %B: memref<16xf32>) { + // First loop: writes to A. + affine.for %i = 0 to 16 { + %idx = arith.index_cast %i : index to i32 + %val = arith.sitofp %idx : i32 to f32 + affine.store %val, %A[%i] : memref<16xf32> + } + + // Second loop: writes to B - independent from first loop. + affine.for %i = 0 to 16 { + %idx = arith.index_cast %i : index to i32 + %val = arith.sitofp %idx : i32 to f32 + %doubled = arith.mulf %val, %val : f32 + affine.store %doubled, %B[%i] : memref<16xf32> + } + + return + } +} + +// After conversion and optimization, both loops become separate tasks. + +// CHECK: module { +// CHECK-NEXT: func.func @test_fusion_with_outputs(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { +// CHECK-NEXT: %memory_outputs = "taskflow.task"(%arg0) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// CHECK-NEXT: ^bb0(%arg2: memref<16xf32>): +// CHECK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index +// CHECK-NEXT: taskflow.hyperblock indices(%0 : index) { +// CHECK-NEXT: ^bb0(%arg3: index): +// CHECK-NEXT: %1 = arith.index_cast %arg3 : index to i32 +// CHECK-NEXT: %2 = arith.sitofp %1 : i32 to f32 +// CHECK-NEXT: memref.store %2, %arg2[%arg3] : memref<16xf32> +// CHECK-NEXT: } -> () +// CHECK-NEXT: "taskflow.yield"(%arg2) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// CHECK-NEXT: }) : (memref<16xf32>) -> memref<16xf32> +// CHECK-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// CHECK-NEXT: ^bb0(%arg2: memref<16xf32>): +// CHECK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index +// CHECK-NEXT: taskflow.hyperblock indices(%0 : index) { +// CHECK-NEXT: ^bb0(%arg3: index): +// CHECK-NEXT: %1 = arith.index_cast %arg3 : index to i32 +// CHECK-NEXT: %2 = arith.sitofp %1 : i32 to f32 +// CHECK-NEXT: %3 = arith.mulf %2, %2 : f32 +// CHECK-NEXT: memref.store %3, %arg2[%arg3] : memref<16xf32> +// CHECK-NEXT: } -> () +// CHECK-NEXT: "taskflow.yield"(%arg2) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// CHECK-NEXT: }) : (memref<16xf32>) -> memref<16xf32> +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/test/multi-cgra/taskflow/optimization/hyperblock-fusion.mlir b/test/multi-cgra/taskflow/optimization/hyperblock-fusion.mlir new file mode 100644 index 0000000..4839b93 --- /dev/null +++ b/test/multi-cgra/taskflow/optimization/hyperblock-fusion.mlir @@ -0,0 +1,57 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task --optimize-task-graph \ +// RUN: | FileCheck %s + +// Tests hyperblock fusion behavior for adjacent hyperblocks with identical counter structures. +// Two independent top-level loops with the same bounds become separate tasks, each with its own hyperblock (no cross-task fusion). + +module { + func.func @test_hyperblock_fusion(%A: memref<16xf32>, %B: memref<16xf32>, %scale: f32) { + // First loop: reads A, writes A. + affine.for %i = 0 to 16 { + %v = affine.load %A[%i] : memref<16xf32> + %scaled = arith.mulf %v, %scale : f32 + affine.store %scaled, %A[%i] : memref<16xf32> + } + + // Second loop: reads B, writes B - independent from first loop. + affine.for %i = 0 to 16 { + %v = affine.load %B[%i] : memref<16xf32> + %scaled = arith.mulf %v, %scale : f32 + affine.store %scaled, %B[%i] : memref<16xf32> + } + + return + } +} + +// After conversion and optimization, both top-level loops become separate tasks. +// Cross-task fusion is not performed; each task has one hyperblock. + +// CHECK: module { +// CHECK-NEXT: func.func @test_hyperblock_fusion(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: f32) { +// CHECK-NEXT: %memory_outputs = "taskflow.task"(%arg0, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: f32): +// CHECK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index +// CHECK-NEXT: taskflow.hyperblock indices(%0 : index) { +// CHECK-NEXT: ^bb0(%arg5: index): +// CHECK-NEXT: %1 = memref.load %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: %2 = arith.mulf %1, %arg4 : f32 +// CHECK-NEXT: memref.store %2, %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: } -> () +// CHECK-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// CHECK-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> +// CHECK-NEXT: %memory_outputs_0 = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_1"}> ({ +// CHECK-NEXT: ^bb0(%arg3: memref<16xf32>, %arg4: f32): +// CHECK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index +// CHECK-NEXT: taskflow.hyperblock indices(%0 : index) { +// CHECK-NEXT: ^bb0(%arg5: index): +// CHECK-NEXT: %1 = memref.load %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: %2 = arith.mulf %1, %arg4 : f32 +// CHECK-NEXT: memref.store %2, %arg3[%arg5] : memref<16xf32> +// CHECK-NEXT: } -> () +// CHECK-NEXT: "taskflow.yield"(%arg3) <{operandSegmentSizes = array}> : (memref<16xf32>) -> () +// CHECK-NEXT: }) : (memref<16xf32>, f32) -> memref<16xf32> +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/test/multi-cgra/taskflow/optimization/nested-fusion.mlir b/test/multi-cgra/taskflow/optimization/nested-fusion.mlir new file mode 100644 index 0000000..1000396 --- /dev/null +++ b/test/multi-cgra/taskflow/optimization/nested-fusion.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \ +// RUN: --construct-hyperblock-from-task --optimize-task-graph \ +// RUN: | FileCheck %s + +// Tests hyperblock fusion for nested loops with identical counter structures. +// Two independent nested loops in the same task should be fused. + +module { + func.func @test_nested_fusion(%A: memref<8x8xf32>, %B: memref<8x8xf32>, + %C: memref<8x8xf32>, %D: memref<8x8xf32>) { + // Outer loop creates a single task with two inner loops. + affine.for %i = 0 to 8 { + // First inner loop: copies A to C. + affine.for %j = 0 to 8 { + %v = affine.load %A[%i, %j] : memref<8x8xf32> + affine.store %v, %C[%i, %j] : memref<8x8xf32> + } + // Second inner loop: copies B to D - independent from first. + // Should be fused with first loop since same counter structure. + affine.for %j = 0 to 8 { + %v = affine.load %B[%i, %j] : memref<8x8xf32> + affine.store %v, %D[%i, %j] : memref<8x8xf32> + } + } + return + } +} + +// After optimization, both inner loops should be fused into ONE hyperblock. + +// CHECK: module { +// CHECK-NEXT: func.func @test_nested_fusion(%arg0: memref<8x8xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>) { +// CHECK-NEXT: %memory_outputs:2 = "taskflow.task"(%arg0, %arg1, %arg2, %arg3) <{operandSegmentSizes = array, resultSegmentSizes = array, task_name = "Task_0"}> ({ +// CHECK-NEXT: ^bb0(%arg4: memref<8x8xf32>, %arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>): +// CHECK-NEXT: %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// CHECK-NEXT: %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// CHECK-NEXT: %2 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index +// CHECK-NEXT: taskflow.hyperblock indices(%0, %1 : index, index) { +// CHECK-NEXT: ^bb0(%arg8: index, %arg9: index): +// CHECK-NEXT: %3 = memref.load %arg4[%arg8, %arg9] : memref<8x8xf32> +// CHECK-NEXT: memref.store %3, %arg6[%arg8, %arg9] : memref<8x8xf32> +// CHECK-NEXT: %4 = memref.load %arg5[%arg8, %arg9] : memref<8x8xf32> +// CHECK-NEXT: memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32> +// CHECK-NEXT: } -> () +// CHECK-NEXT: "taskflow.yield"(%arg6, %arg7) <{operandSegmentSizes = array}> : (memref<8x8xf32>, memref<8x8xf32>) -> () +// CHECK-NEXT: }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>) +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NEXT: }