diff --git a/include/Conversion/AffineToNeura/LoopNestAnalysis.h b/include/Conversion/AffineToNeura/LoopNestAnalysis.h
new file mode 100644
index 00000000..ce18a4cc
--- /dev/null
+++ b/include/Conversion/AffineToNeura/LoopNestAnalysis.h
@@ -0,0 +1,80 @@
+//===- LoopNestAnalysis.h - Analyze affine loop nests ----------*- C++ -*-===//
+//
+// Loop nest analysis for affine loops.
+//
+// Features:
+// 1. Build the loop hierarchy tree (parent-child relationships, nesting depth)
+// 2. Identify perfect vs. imperfect nesting
+// 3. Support valid-signal reuse optimization for nested loops
+//
+//===----------------------------------------------------------------------===//
+#ifndef CONVERSION_AFFINE_TO_NEURA_LOOP_NEST_ANALYSIS_H
+#define CONVERSION_AFFINE_TO_NEURA_LOOP_NEST_ANALYSIS_H
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include <memory>
+
+namespace mlir {
+namespace neura {
+
+/// Loop information structure - stores all analysis information for a single loop.
+struct LoopInfo {
+  affine::AffineForOp loop;               // The loop operation itself.
+  LoopInfo *parent = nullptr;             // Parent loop (nullptr if top-level).
+  llvm::SmallVector<LoopInfo *> children; // Child loops list.
+  unsigned depth = 0;                     // Nesting depth (0 = top-level).
+  bool isPerfectNest = true;              // Whether it is a perfect nest.
+
+  // Operation lists for imperfect nesting.
+  llvm::SmallVector<Operation *> operationsBeforeChild; // Operations before child loops.
+  llvm::SmallVector<Operation *> operationsAfterChild;  // Operations after child loops.
+
+  LoopInfo(affine::AffineForOp loop) : loop(loop) {}
+};
+
+/// Loop nest analysis class.
+///
+/// Purpose: provides loop hierarchy information to the AffineToNeura pass to
+/// support optimization decisions.
+///
+/// Usage example:
+///   LoopNestAnalysis analysis(func_op);
+///   analysis.dump(); // Prints analysis results.
+///   LoopInfo *info = analysis.getLoopInfo(loop);
+///   if (info && info->parent) {
+///     // This is a nested loop; it can reuse the parent's valid signal.
+///   }
+class LoopNestAnalysis {
+public:
+  /// Constructor - performs loop nest analysis on the given function.
+  explicit LoopNestAnalysis(func::FuncOp func);
+
+  /// Query interfaces.
+  LoopInfo *getLoopInfo(affine::AffineForOp loop) const; // Gets loop information.
+  llvm::ArrayRef<LoopInfo *> getTopLevelLoops() const { return topLevelLoops; } // Gets top-level loops.
+  llvm::ArrayRef<std::unique_ptr<LoopInfo>> getAllLoops() const { return allLoops; } // Gets all loops.
+  bool isPerfectNest(affine::AffineForOp loop) const; // Checks if perfect nest.
+  LoopInfo *getParentLoop(affine::AffineForOp loop) const; // Gets parent loop.
+  llvm::ArrayRef<LoopInfo *> getChildLoops(affine::AffineForOp loop) const; // Gets child loops.
+
+  /// Debug interface - prints analysis results.
+  void dump() const;
+
+private:
+  /// Internal analysis methods.
+  void buildLoopNestTree(func::FuncOp func); // Builds the loop hierarchy tree.
+  void analyzePerfectNests();                // Analyzes perfect-nest characteristics.
+
+  /// Data members.
+  llvm::DenseMap<Operation *, LoopInfo *> loopMap;          // Fast loop lookup table.
+  llvm::SmallVector<std::unique_ptr<LoopInfo>, 8> allLoops; // All loops (owns the LoopInfo objects).
+  llvm::SmallVector<LoopInfo *> topLevelLoops;              // Top-level loop pointers.
+};
+
+} // namespace neura
+} // namespace mlir
+
+#endif
diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 30cbf0e8..15f9b2d6 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -18,6 +18,7 @@ std::unique_ptr<Pass> createLowerArithToNeuraPass();
 std::unique_ptr<Pass> createLowerLlvmToNeuraPass();
 std::unique_ptr<Pass> createLowerMemRefToNeuraPass();
 std::unique_ptr<Pass> createLowerBuiltinToNeuraPass();
+std::unique_ptr<Pass> createLowerAffineToNeuraPass();
 
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index 2e79dd96..7044b9ad 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -32,4 +32,16 @@ def LowerBuiltinToNeura : Pass<"lower-builtin-to-neura", "ModuleOp">{
   let constructor = "mlir::createLowerBuiltinToNeuraPass()";
 }
 
+def LowerAffineToNeura : Pass<"lower-affine-to-neura", "func::FuncOp">{
+  let summary = "Lower perfectly nested affine loops to Neura loop_control operations";
+  let description = [{
+    Converts perfectly nested affine.for loops directly to the Neura dialect
+    using loop_control operations, avoiding the need to flatten to LLVM IR
+    first. This preserves loop structure information for better optimization
+    on dataflow architectures.
+  }];
+  let constructor = "mlir::createLowerAffineToNeuraPass()";
+  let dependentDialects = ["mlir::neura::NeuraDialect", "mlir::affine::AffineDialect"];
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h
index ca7a4951..bae2db90 100644
--- a/include/NeuraDialect/Architecture/Architecture.h
+++ b/include/NeuraDialect/Architecture/Architecture.h
@@ -57,7 +57,9 @@ enum OperationKind {
   // Loop control operations.
   ILoopControl = 34,
   // Constant operations.
-  IConstant = 35
+  IConstant = 35,
+  // Steering control fused operations.
+  ICarryInvariant = 36, IConditionalSelect = 37, IInvariantGroup = 38
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h
index 8c301aa1..cf85d2a2 100644
--- a/include/NeuraDialect/Mapping/mapping_util.h
+++ b/include/NeuraDialect/Mapping/mapping_util.h
@@ -12,6 +12,10 @@ OperationKind getOperationKindFromMlirOp(Operation *op);
 // Returns true if the operation does not need CGRA tile placement.
 bool is_non_materialized(Operation *op);
 
+// Returns true if the operation is a steering-mode operation that doesn't
+// require DataMovOp wrapping (e.g., constants, carry, invariant).
+bool is_steering_unwrapped_op(Operation *op);
+
 // Returns true if the operation is a materialized reserve user, i.e.,
 // phi, invariant, carry.
 bool isMaterializedReserveUser(Operation *op);
diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index eeb2677a..91f303fa 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -657,4 +657,133 @@ def Neura_InvariantOp : Op{
   let arguments = (ins AnyType:$initial, AnyType:$condition);
   let results = (outs AnyType:$result);
   let assemblyFormat = "$initial `,` $condition attr-dict `:` type($initial) `,` type($condition) `->` type($result)";
+}
+
+// ============================================================================
+// FUSED OPERATIONS FOR RECMII OPTIMIZATION
+// ============================================================================
+
+// Defines the carry_invariant fused operation.
+def Neura_CarryInvariantOp : Op<Neura_Dialect, "carry_invariant">{
+  let summary = "Fused carry and invariant operation for nested loops.";
+  let description = [{
+    Combines carry and invariant operations into a single operation to reduce
+    RecMII. This is optimized for nested loop patterns where an inner loop's
+    carry result is used as an invariant in the outer loop.
+
+    Semantics:
+    - If inner_condition is false (first inner iteration): returns the initial value.
+    - Else if outer_condition is false (outer loop active, inner loop invariant):
+      returns the initial value from the inner carry.
+    - Else: returns the carried value.
+
+    Replaces the pattern:
+      %carry_result = neura.carry %init, %inner_cond, %carried
+      %inv_result = neura.invariant %carry_result, %outer_cond
+
+    With:
+      %result = neura.carry_invariant %init, %inner_cond, %outer_cond, %carried
+
+    RecMII impact: reduces 2 operations to 1 (-50% on the critical path).
+
+    Example:
+      %out = neura.carry_invariant %init, %inner_cond, %outer_cond, %carried
+             : i64, i1, i1, i64 -> i64
+  }];
+
+  let arguments = (ins
+    AnyType:$initial,
+    AnyType:$inner_condition,
+    AnyType:$outer_condition,
+    AnyType:$carried
+  );
+  let results = (outs AnyType:$result);
+
+  let assemblyFormat = [{
+    $initial `,` $inner_condition `,` $outer_condition `,` $carried attr-dict
+    `:` type($initial) `,` type($inner_condition) `,` type($outer_condition) `,`
+    type($carried) `->` type($result)
+  }];
+}
+
+// Defines the conditional_select fused operation.
+def Neura_ConditionalSelectOp : Op<Neura_Dialect, "cond_select">{
+  let summary = "Fused comparison and conditional selection operation.";
+  let description = [{
+    Combines comparison (icmp) and conditional selection (false_steer) into a
+    single atomic operation to reduce RecMII.
+
+    Semantics:
+    - Performs the comparison: cond = (lhs <predicate> rhs).
+    - If cond is false: returns value.
+    - If cond is true: returns the default value (typically supplied by hardware).
+
+    Replaces the pattern:
+      %cond = neura.icmp %lhs, %rhs <{cmpType = "slt"}>
+      %result = neura.false_steer %value, %cond
+
+    With:
+      %result = neura.cond_select %lhs, %rhs, %value <{predicate = "slt"}>
+
+    RecMII impact: reduces 2 operations to 1 (-50% on the critical path).
+
+    Supported predicates: "eq", "ne", "slt", "sle", "sgt", "sge", "ult", "ule", "ugt", "uge".
+
+    Example:
+      %out = neura.cond_select %a, %b, %val <{predicate = "slt"}>
+             : i64, i64, i64 -> i64
+  }];
+
+  let arguments = (ins
+    AnyType:$lhs,
+    AnyType:$rhs,
+    AnyType:$value,
+    StrAttr:$predicate
+  );
+  let results = (outs AnyType:$result);
+
+  let assemblyFormat = [{
+    $lhs `,` $rhs `,` $value attr-dict `:` type($lhs) `,` type($rhs) `,`
+    type($value) `->` type($result)
+  }];
+}
+
+// Defines the invariant_group batch operation.
+def Neura_InvariantGroupOp : Op<Neura_Dialect, "invariant_group">{
+  let summary = "Batch invariant extraction for multiple values.";
+  let description = [{
+    Extracts multiple invariants that share the same condition in a single
+    operation. This is optimized for nested loops where many values need to
+    be marked as invariant with respect to the outer loop.
+
+    Hardware can optimize this by:
+    - Sharing the condition-checking logic
+    - Extracting invariants in parallel
+    - Reducing control overhead
+
+    Replaces multiple individual invariant operations:
+      %inv1 = neura.invariant %val1, %cond
+      %inv2 = neura.invariant %val2, %cond
+      %inv3 = neura.invariant %val3, %cond
+
+    With a single batch operation:
+      %inv1, %inv2, %inv3 = neura.invariant_group %val1, %val2, %val3, %cond
+
+    ResMII impact: reduces N operations to 1 (improves resource utilization).
+
+    Example:
+      %out1, %out2, %out3 = neura.invariant_group %in1, %in2, %in3, %cond
+                            : i64, i64, i64, i1 -> i64, i64, i64
+  }];
+
+  let arguments = (ins
+    Variadic<AnyType>:$inputs,
+    AnyType:$condition
+  );
+  let results = (outs Variadic<AnyType>:$outputs);
+
+  let assemblyFormat = [{
+    $inputs `,` $condition attr-dict `:` type($inputs) `,` type($condition)
+    `->` type($outputs)
+  }];
+}
\ No newline at end of file
diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index 3d70af2c..d7f4974a 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -134,4 +134,5 @@ def RemovePredicatedType : Pass<"remove-predicated-type", "ModuleOp"> {
   }];
   let constructor = "neura::createRemovePredicatedTypePass()";
 }
+
 #endif // NEURA_PASSES_TD
\ No newline at end of file
diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
new file mode 100644
index 00000000..77afea12
--- /dev/null
+++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
@@ -0,0 +1,480 @@
+#include "Common/AcceleratorAttrs.h"
+#include "Conversion/ConversionPasses.h"
+#include "Conversion/AffineToNeura/LoopNestAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Region.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "llvm/Support/raw_ostream.h"
+#include <functional>
+
+using namespace mlir;
+using namespace mlir::neura;
+using namespace mlir::func;
+
+#define GEN_PASS_DEF_LOWERAFFINETONEURA
+#include "Conversion/ConversionPasses.h.inc"
+
+namespace {
+LogicalResult convertAffineMapToIndices(AffineMap map, ValueRange map_operands,
+                                        Location loc, PatternRewriter &rewriter,
+                                        SmallVector<Value> &new_indices) {
+  new_indices.clear();
+  new_indices.reserve(map.getNumResults());
+  for (AffineExpr expr : map.getResults()) {
+    if (AffineConstantExpr const_expr = dyn_cast<AffineConstantExpr>(expr)) {
+      IndexType index_type = rewriter.getIndexType();
+      IntegerAttr value_attr =
+          rewriter.getIntegerAttr(index_type, const_expr.getValue());
+      new_indices.push_back(rewriter.create<neura::ConstantOp>(
+          loc, index_type, value_attr));
+    } else if
(AffineDimExpr dim_expr = dyn_cast<AffineDimExpr>(expr)) {
+      if (dim_expr.getPosition() >= map.getNumDims() ||
+          dim_expr.getPosition() >=
+              map_operands.size()) { // Checks against map_operands size for safety.
+        return failure();
+      }
+      new_indices.push_back(map_operands[dim_expr.getPosition()]);
+    } else if (AffineSymbolExpr sym_expr = dyn_cast<AffineSymbolExpr>(expr)) {
+      unsigned symbol_operand_index = map.getNumDims() + sym_expr.getPosition();
+      if (symbol_operand_index >= map_operands.size()) {
+        return failure();
+      }
+      new_indices.push_back(map_operands[symbol_operand_index]);
+    } else {
+      // For more complex affine expressions (e.g., d0 + c1),
+      // expands them into explicit Neura arithmetic operations.
+      // Supports: Add, Mul, Mod, FloorDiv, CeilDiv.
+      llvm::errs() << "[affine2neura] Expanding complex affine expression: "
+                   << expr << "\n";
+
+      // Helper lambda: recursively expands an AffineExpr to a Value.
+      std::function<Value(AffineExpr)> expandExpr =
+          [&](AffineExpr e) -> Value {
+        // Constant expression.
+        if (auto const_expr = dyn_cast<AffineConstantExpr>(e)) {
+          return rewriter.create<neura::ConstantOp>(
+              loc, rewriter.getIndexType(),
+              rewriter.getIntegerAttr(rewriter.getIndexType(),
+                                      const_expr.getValue()));
+        }
+        // Dimension expression.
+        else if (auto dim_expr = dyn_cast<AffineDimExpr>(e)) {
+          return map_operands[dim_expr.getPosition()];
+        }
+        // Symbol expression.
+        else if (auto sym_expr = dyn_cast<AffineSymbolExpr>(e)) {
+          unsigned symbol_operand_index =
+              map.getNumDims() + sym_expr.getPosition();
+          return map_operands[symbol_operand_index];
+        }
+        // Binary operation expression.
+        else if (auto bin_expr = dyn_cast<AffineBinaryOpExpr>(e)) {
+          Value lhs = expandExpr(bin_expr.getLHS());
+          Value rhs = expandExpr(bin_expr.getRHS());
+          if (!lhs || !rhs) {
+            return Value();
+          }
+
+          switch (bin_expr.getKind()) {
+          case AffineExprKind::Add:
+            return rewriter.create<neura::AddOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::Mul:
+            return rewriter.create<neura::MulOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::Mod:
+            return rewriter.create<neura::RemOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::FloorDiv:
+            return rewriter.create<neura::DivOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::CeilDiv: {
+            // ceildiv(a, b) = floordiv(a + b - 1, b).
+            Value one = rewriter.create<neura::ConstantOp>(
+                loc, rewriter.getIndexType(),
+                rewriter.getIntegerAttr(rewriter.getIndexType(), 1));
+            Value b_minus_1 = rewriter.create<neura::SubOp>(
+                loc, rewriter.getIndexType(), rhs, one).getResult();
+            Value numerator = rewriter.create<neura::AddOp>(
+                loc, rewriter.getIndexType(), lhs, b_minus_1).getResult();
+            return rewriter.create<neura::DivOp>(
+                loc, rewriter.getIndexType(), numerator, rhs).getResult();
+          }
+          default:
+            llvm::errs() << "[affine2neura] Unsupported binary op kind: "
+                         << static_cast<int>(bin_expr.getKind()) << "\n";
+            return Value();
+          }
+        }
+
+        llvm::errs() << "[affine2neura] Unsupported affine expression type\n";
+        return Value();
+      };
+
+      Value expanded = expandExpr(expr);
+      if (!expanded) {
+        // Fallback: if expansion fails, use affine.apply (ensures correctness).
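+        // Illustration: expansion only returns a null Value when it hits an
+        // expression kind outside Add/Mul/Mod/FloorDiv/CeilDiv, so a subscript
+        // like d0 * 2 + 1 is fully expanded above, while anything the switch
+        // does not cover falls through to the affine.apply below.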
+ llvm::errs() << "[affine2neura] Failed to expand, using affine.apply\n"; + AffineMap single_result_map = AffineMap::get( + map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); + expanded = rewriter.create( + loc, single_result_map, map_operands); + } + new_indices.push_back(expanded); + } + } + return success(); +} + +struct AffineLoadLowering : public OpRewritePattern { + AffineLoadLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineLoadOp load_op, + PatternRewriter &rewriter) const override { + Location loc = load_op.getLoc(); + auto memref = load_op.getMemref(); + AffineMap map = load_op.getAffineMap(); + ValueRange map_operands = load_op.getMapOperands(); + // Gets the indices for the load operation. + SmallVector new_indices; + if (failed(convertAffineMapToIndices(map, map_operands, loc, rewriter, + new_indices))) { + return load_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); + } + + MemRefType memref_type = dyn_cast(memref.getType()); + if (!memref_type) { + return load_op.emitError( + "[affine2neura] Base of load is not a MemRefType"); + } + if (new_indices.size() != static_cast(memref_type.getRank())) { + return load_op.emitError( + "[affine2neura] Number of indices from affine map (") + << new_indices.size() << ") does not match memref rank (" + << memref_type.getRank() << ")"; + } + + // Creates the neura.load_indexed operation. + LoadIndexedOp new_load_op = rewriter.create( + loc, load_op.getType(), memref, ValueRange{new_indices}); + + rewriter.replaceOp(load_op, new_load_op.getResult()); + return success(); + } +}; + +struct AffineStoreLowering : public OpRewritePattern { + AffineStoreLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineStoreOp store_op, + PatternRewriter &rewriter) const override { + Location loc = store_op.getLoc(); + auto memref = store_op.getMemref(); + Value value = store_op.getValueToStore(); + AffineMap map = store_op.getAffineMap(); + ValueRange mapOperands = store_op.getMapOperands(); + + SmallVector newIndices; + if (failed(convertAffineMapToIndices(map, mapOperands, loc, rewriter, + newIndices))) { + return store_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); + } + + MemRefType memRefType = dyn_cast(memref.getType()); + if (!memRefType) { + return store_op.emitError( + "[affine2neura] Base of store is not a MemRefType"); + } + if (newIndices.size() != static_cast(memRefType.getRank())) { + return store_op.emitError( + "[affine2neura] Number of indices from affine map (") + << newIndices.size() << ") does not match memref rank (" + << memRefType.getRank() << ")"; + } + + rewriter.create(loc, value, memref, + ValueRange{newIndices}); + rewriter.eraseOp(store_op); + return success(); + } +}; + +struct AffineApplyLowering : public OpRewritePattern { + AffineApplyLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineApplyOp apply_op, + PatternRewriter &rewriter) const override { + AffineMap map = apply_op.getAffineMap(); + ValueRange operands = apply_op.getMapOperands(); + Location loc = apply_op.getLoc(); + + if (map.getNumResults() != 1) { + return apply_op.emitError( + "[affine2neura] AffineApplyOp must have a single result"); + } + + AffineExpr expr = map.getResult(0); + llvm::errs() << "[affine2neura] Expanding affine.apply expression: " + << expr << "\n"; + 
+    // Helper lambda: recursively expands an AffineExpr to a Value.
+    std::function<Value(AffineExpr)> expandExpr =
+        [&](AffineExpr e) -> Value {
+      // Constant expression.
+      if (auto const_expr = dyn_cast<AffineConstantExpr>(e)) {
+        return rewriter.create<neura::ConstantOp>(
+            loc, rewriter.getIndexType(),
+            rewriter.getIntegerAttr(rewriter.getIndexType(),
+                                    const_expr.getValue()));
+      }
+      // Dimension expression.
+      else if (auto dim_expr = dyn_cast<AffineDimExpr>(e)) {
+        return operands[dim_expr.getPosition()];
+      }
+      // Symbol expression.
+      else if (auto sym_expr = dyn_cast<AffineSymbolExpr>(e)) {
+        unsigned symbol_operand_index =
+            map.getNumDims() + sym_expr.getPosition();
+        return operands[symbol_operand_index];
+      }
+      // Binary operation expression.
+      else if (auto bin_expr = dyn_cast<AffineBinaryOpExpr>(e)) {
+        Value lhs = expandExpr(bin_expr.getLHS());
+        Value rhs = expandExpr(bin_expr.getRHS());
+
+        if (!lhs || !rhs) {
+          return Value();
+        }
+
+        switch (bin_expr.getKind()) {
+        case AffineExprKind::Add:
+          return rewriter.create<neura::AddOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::Mul:
+          return rewriter.create<neura::MulOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::Mod:
+          return rewriter.create<neura::RemOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::FloorDiv:
+          return rewriter.create<neura::DivOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::CeilDiv: {
+          // ceildiv(a, b) = floordiv(a + b - 1, b).
+          Value one = rewriter.create<neura::ConstantOp>(
+              loc, rewriter.getIndexType(),
+              rewriter.getIntegerAttr(rewriter.getIndexType(), 1));
+          Value b_minus_1 = rewriter.create<neura::SubOp>(
+              loc, rewriter.getIndexType(), rhs, one).getResult();
+          Value numerator = rewriter.create<neura::AddOp>(
+              loc, rewriter.getIndexType(), lhs, b_minus_1).getResult();
+          return rewriter.create<neura::DivOp>(
+              loc, rewriter.getIndexType(), numerator, rhs).getResult();
+        }
+        default:
+          llvm::errs() << "[affine2neura] Unsupported binary op kind: "
+                       << static_cast<int>(bin_expr.getKind()) << "\n";
+          return Value();
+        }
+      }
+
+      llvm::errs() << "[affine2neura] Unsupported affine expression type\n";
+      return Value();
+    };
+
+    Value expanded = expandExpr(expr);
+    if (!expanded) {
+      return apply_op.emitError(
+          "[affine2neura] Failed to expand affine.apply expression");
+    }
+
+    rewriter.replaceOp(apply_op, expanded);
+    return success();
+  }
+};
+
+struct AffineForLowering : public OpRewritePattern<affine::AffineForOp> {
+  const LoopNestAnalysis &analysis;
+  llvm::DenseMap<Operation *, Value> &loopValidSignals;
+
+  AffineForLowering(MLIRContext *context, const LoopNestAnalysis &analysis,
+                    llvm::DenseMap<Operation *, Value> &loopValidSignals)
+      : OpRewritePattern(context, /*benefit=*/1),
+        analysis(analysis), loopValidSignals(loopValidSignals) {}
+
+  LogicalResult matchAndRewrite(affine::AffineForOp for_op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = for_op.getLoc();
+
+    // Extracts the loop bounds - these must be constant.
+    // Dynamic bounds are not supported because neura.loop_control requires
+    // compile-time constant attributes for hardware configuration.
+    if (!for_op.hasConstantLowerBound() || !for_op.hasConstantUpperBound()) {
+      return for_op.emitError(
+          "[affine2neura] Non-constant loop bounds not supported. "
+          "Loop bounds must be compile-time constants for CGRA configuration");
+    }
+
+    int64_t lower_bound = for_op.getConstantLowerBound();
+    int64_t upper_bound = for_op.getConstantUpperBound();
+    int64_t step = for_op.getStepAsInt();
+
+    // Gets the loop nesting information.
+    LoopInfo *loopInfo = analysis.getLoopInfo(for_op);
+    Type i1_type = rewriter.getI1Type();
+    Value parent_valid;
+
+    // Optimization: reuses the parent loop's valid signal for nested loops.
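+    // For a perfect 2-deep nest this yields a single control chain (sketch,
+    // matching the expected IR in test/Conversion/AffineToNeura):
+    //   %g      = "neura.grant_once"() : () -> i1
+    //   %i, %vi = "neura.loop_control"(%g)  <{start = 0, end = 10, step = 1, ...}>
+    //   %j, %vj = "neura.loop_control"(%vi) <{start = 0, end = 20, step = 1, ...}>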
+    // This avoids creating redundant grant_once operations.
+    if (loopInfo && loopInfo->parent) {
+      // This is a nested loop - tries to reuse the parent's loop_valid signal.
+      auto it = loopValidSignals.find(loopInfo->parent->loop.getOperation());
+      if (it != loopValidSignals.end()) {
+        parent_valid = it->second;
+        llvm::errs() << "[affine2neura] Reusing parent valid signal for "
+                     << "nested loop (depth=" << loopInfo->depth << ")\n";
+      } else {
+        // Fallback: the parent is not yet converted, so creates a grant_once.
+        parent_valid = rewriter.create<neura::GrantOnceOp>(
+            loc, i1_type, /*value=*/Value(), /*constant_value=*/nullptr);
+        llvm::errs() << "[affine2neura] Parent valid not available, "
+                     << "creating grant_once for nested loop\n";
+      }
+    } else {
+      // Top-level loop - creates a grant_once.
+      parent_valid = rewriter.create<neura::GrantOnceOp>(
+          loc, i1_type, /*value=*/Value(), /*constant_value=*/nullptr);
+      if (loopInfo) {
+        llvm::errs() << "[affine2neura] Created grant_once for top-level loop "
+                     << "(depth=" << loopInfo->depth << ")\n";
+      }
+    }
+
+    // Creates the loop_control operation.
+    auto index_type = rewriter.getIndexType();
+
+    auto loop_control = rewriter.create<neura::LoopControlOp>(
+        loc,
+        /*resultTypes=*/TypeRange{index_type, i1_type},
+        /*parentValid=*/parent_valid,
+        /*iterationType=*/rewriter.getStringAttr("increment"),
+        /*start=*/rewriter.getI64IntegerAttr(lower_bound),
+        /*end=*/rewriter.getI64IntegerAttr(upper_bound),
+        /*step=*/rewriter.getI64IntegerAttr(step));
+
+    Value loop_index = loop_control.getResult(0);
+    Value loop_valid = loop_control.getResult(1);
+
+    // Stores the loop_valid signal for child loops to use.
+    // This enables the valid-signal reuse optimization for nested loops.
+    loopValidSignals[for_op.getOperation()] = loop_valid;
+
+    // Replaces uses of the induction variable.
+    for_op.getInductionVar().replaceAllUsesWith(loop_index);
+
+    // Inlines the body operations before the for_op.
+    Block &body_block = for_op.getRegion().front();
+    Operation *terminator = body_block.getTerminator();
+    rewriter.eraseOp(terminator); // Removes affine.yield first.
+
+    // Merges the loop body into the parent block before the for_op.
+    // Passes loop_index so the induction-variable block argument is remapped
+    // (the block still carries that argument even after the RAUW above).
+    rewriter.inlineBlockBefore(&body_block, for_op.getOperation(), {loop_index});
+
+    // Erases the for_op.
+    rewriter.eraseOp(for_op);
+
+    return success();
+  }
+};
+
+struct LowerAffineToNeuraPass
+    : public PassWrapper<LowerAffineToNeuraPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerAffineToNeuraPass)
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<neura::NeuraDialect, affine::AffineDialect>();
+  }
+
+  StringRef getArgument() const override { return "lower-affine-to-neura"; }
+  StringRef getDescription() const override {
+    return "Lower affine operations to Neura dialect operations";
+  }
+
+  void runOnOperation() override {
+    ModuleOp module_op = getOperation();
+    MLIRContext *context = module_op.getContext();
+
+    module_op.walk([&](func::FuncOp func_op) {
+      // Checks whether the function targets the neura accelerator; applies to
+      // all functions if no attribute is present.
+      if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) {
+        auto target = func_op->getAttrOfType<StringAttr>(
+            mlir::accel::kAcceleratorAttr);
+        if (!target || target.getValue() != mlir::accel::kNeuraTarget) {
+          return; // Skips this function.
+        }
+      }
+      // If no accelerator attribute, applies the pass anyway (for testing).
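+      // Gating example (illustrative only; the attribute key is whatever
+      // kAcceleratorAttr expands to):
+      //   func.func @kernel(...) attributes {accelerator = "neura"} { ... }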
+
+      // Step 1: Performs loop nest analysis.
+      // This builds the loop hierarchy and identifies perfect/imperfect nests.
+      llvm::errs() << "[affine2neura] Analyzing loop nests in function: "
+                   << func_op.getName() << "\n";
+      LoopNestAnalysis analysis(func_op);
+      analysis.dump(); // Prints analysis results for debugging.
+
+      // Step 2: Creates a map to store loop_valid signals.
+      // This allows nested loops to reuse the parent's valid signal.
+      llvm::DenseMap<Operation *, Value> loopValidSignals;
+
+      // Step 3: Sets up the dialect conversion.
+      // Dialect conversion is used instead of the greedy pattern rewriter because:
+      // 1. It provides better error reporting when conversion fails.
+      // 2. It explicitly defines which operations are legal/illegal.
+      // 3. It is the standard approach for dialect lowering passes.
+      ConversionTarget target(*context);
+      target.addLegalDialect<neura::NeuraDialect, arith::ArithDialect,
+                             func::FuncDialect, memref::MemRefDialect>();
+      target.addIllegalDialect<affine::AffineDialect>();
+
+      // Step 4: Registers the rewrite patterns with the analysis.
+      RewritePatternSet patterns(context);
+      patterns.add<AffineLoadLowering, AffineStoreLowering,
+                   AffineApplyLowering>(context);
+      // Passes references to the analysis and the loopValidSignals map.
+      patterns.add<AffineForLowering>(context, std::cref(analysis),
+                                      std::ref(loopValidSignals));
+
+      if (failed(applyPartialConversion(func_op, target, std::move(patterns)))) {
+        func_op.emitError("[affine2neura] Failed to lower affine "
+                          "operations to Neura dialect");
+        signalPassFailure();
+      }
+    });
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createLowerAffineToNeuraPass() {
+  return std::make_unique<LowerAffineToNeuraPass>();
+}
\ No newline at end of file
diff --git a/lib/Conversion/AffineToNeura/CMakeLists.txt b/lib/Conversion/AffineToNeura/CMakeLists.txt
new file mode 100644
index 00000000..285099f3
--- /dev/null
+++ b/lib/Conversion/AffineToNeura/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_mlir_dialect_library(MLIRNeuraAffineToNeuraPass
+  AffineToNeuraPass.cpp
+  LoopNestAnalysis.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/Conversion
+
+  DEPENDS
+  MLIRConversionIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRAffineDialect
+  MLIRNeura
+  MLIRIR
+  MLIRPass
+  MLIRSupport
+  MLIRTransforms
+  MLIRFuncDialect
+)
diff --git a/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp b/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp
new file mode 100644
index 00000000..64b6a029
--- /dev/null
+++ b/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp
@@ -0,0 +1,191 @@
+#include "Conversion/AffineToNeura/LoopNestAnalysis.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::neura;
+
+/// Constructor - performs the complete loop nest analysis.
+LoopNestAnalysis::LoopNestAnalysis(func::FuncOp func) {
+  llvm::errs() << "[LoopNestAnalysis] Starting analysis for function: "
+               << func.getName() << "\n";
+  buildLoopNestTree(func);
+  llvm::errs() << "[LoopNestAnalysis] Found " << allLoops.size() << " loops\n";
+  analyzePerfectNests();
+  llvm::errs() << "[LoopNestAnalysis] Analysis complete\n";
+}
+
+// Builds the loop hierarchy tree.
+void LoopNestAnalysis::buildLoopNestTree(func::FuncOp func) {
+  // Step 1: Collects all loops.
+  func.walk([&](affine::AffineForOp loop) {
+    auto loopInfo = std::make_unique<LoopInfo>(loop);
+    loopMap[loop.getOperation()] = loopInfo.get();
+    allLoops.push_back(std::move(loopInfo));
+  });
+
+  // Step 2: Establishes parent-child relationships.
+  for (auto &loopInfoPtr : allLoops) {
+    LoopInfo *loopInfo = loopInfoPtr.get();
+    affine::AffineForOp loop = loopInfo->loop;
+
+    // Searches upward for the parent loop.
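+    // E.g., in "affine.for %i { affine.for %j { ... } }", the walk from the
+    // %j loop stops at the first enclosing affine.for (the %i loop), so
+    // parent(%j) = %i and depth(%j) = depth(%i) + 1.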
+    Operation *parentOp = loop->getParentOp();
+    while (parentOp && !isa<func::FuncOp>(parentOp)) {
+      if (auto parentLoop = dyn_cast<affine::AffineForOp>(parentOp)) {
+        auto it = loopMap.find(parentLoop.getOperation());
+        if (it != loopMap.end()) {
+          loopInfo->parent = it->second;
+          loopInfo->depth = loopInfo->parent->depth + 1; // depth = parent_depth + 1
+          it->second->children.push_back(loopInfo);
+        }
+        break;
+      }
+      parentOp = parentOp->getParentOp();
+    }
+
+    // If there is no parent loop, this is a top-level loop.
+    if (!loopInfo->parent) {
+      topLevelLoops.push_back(loopInfo);
+    }
+  }
+}
+
+// Analyzes perfect-nesting characteristics.
+void LoopNestAnalysis::analyzePerfectNests() {
+  for (auto &loopInfoPtr : allLoops) {
+    LoopInfo *info = loopInfoPtr.get();
+
+    // Leaf loops are trivially perfect.
+    if (info->children.empty()) {
+      info->isPerfectNest = true;
+      continue;
+    }
+
+    Block &body = info->loop.getRegion().front();
+
+    // Builds the child-loop operation set for fast lookup.
+    llvm::DenseSet<Operation *> childLoopOps;
+    for (LoopInfo *child : info->children) {
+      childLoopOps.insert(child->loop.getOperation());
+    }
+
+    Operation *firstChild = info->children.front()->loop.getOperation();
+    Operation *lastChild = info->children.back()->loop.getOperation();
+
+    // Checks whether operations exist before the first child loop.
+    for (Operation &op : body.getOperations()) {
+      if (&op == firstChild) break;
+      if (isa<affine::AffineYieldOp>(&op)) continue;
+      info->operationsBeforeChild.push_back(&op);
+      info->isPerfectNest = false; // Operations before child → imperfect.
+    }
+
+    // Checks whether operations exist after the last child loop.
+    bool afterLastChild = false;
+    for (Operation &op : body.getOperations()) {
+      if (&op == lastChild) {
+        afterLastChild = true;
+        continue;
+      }
+      if (afterLastChild && !isa<affine::AffineYieldOp>(&op)) {
+        info->operationsAfterChild.push_back(&op);
+        info->isPerfectNest = false; // Operations after child → imperfect.
+      }
+    }
+
+    // Checks whether operations exist between sibling child loops.
+    // Example: affine.for %i { affine.for %j1; op; affine.for %j2 }
+    if (info->children.size() > 1) {
+      bool betweenChildren = false;
+      Operation *prevChild = nullptr;
+
+      for (Operation &op : body.getOperations()) {
+        if (childLoopOps.contains(&op)) {
+          if (prevChild && betweenChildren) {
+            info->isPerfectNest = false; // Operations between siblings → imperfect.
+            break;
+          }
+          prevChild = &op;
+          betweenChildren = false;
+        } else if (prevChild && !isa<affine::AffineYieldOp>(&op)) {
+          betweenChildren = true;
+        }
+      }
+    }
+  }
+}
+
+
+// Query interface implementation.
+
+// Queries LoopInfo by loop operation.
+LoopInfo *LoopNestAnalysis::getLoopInfo(affine::AffineForOp loop) const {
+  auto it = loopMap.find(loop.getOperation());
+  return it != loopMap.end() ? it->second : nullptr;
+}
+
+// Checks whether the loop is a perfect nest.
+bool LoopNestAnalysis::isPerfectNest(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ? info->isPerfectNest : false;
+}
+
+// Gets the parent loop.
+LoopInfo *LoopNestAnalysis::getParentLoop(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ? info->parent : nullptr;
+}
+
+// Gets the list of child loops.
+llvm::ArrayRef<LoopInfo *>
+LoopNestAnalysis::getChildLoops(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ?
llvm::ArrayRef<LoopInfo *>(info->children)
+              : llvm::ArrayRef<LoopInfo *>();
+}
+
+
+// Debug output implementation.
+void LoopNestAnalysis::dump() const {
+  llvm::errs() << "=== Loop Nest Analysis ===\n";
+  llvm::errs() << "Total loops: " << allLoops.size() << "\n";
+  llvm::errs() << "Top-level loops: " << topLevelLoops.size() << "\n\n";
+
+  // Recursive print function.
+  std::function<void(LoopInfo *, unsigned)> printLoop;
+  printLoop = [&](LoopInfo *info, unsigned indent) {
+    // Prints indentation.
+    for (unsigned i = 0; i < indent; ++i) llvm::errs() << "  ";
+
+    // Prints basic loop information.
+    llvm::errs() << "Loop (depth=" << info->depth
+                 << ", perfect=" << (info->isPerfectNest ? "yes" : "no")
+                 << ", children=" << info->children.size() << ")";
+
+    // For an imperfect nest, prints detailed information.
+    if (!info->isPerfectNest) {
+      llvm::errs() << " [IMPERFECT: "
+                   << "ops_before=" << info->operationsBeforeChild.size()
+                   << ", ops_after=" << info->operationsAfterChild.size()
+                   << "]";
+    }
+    llvm::errs() << "\n";
+
+    // Prints location information.
+    for (unsigned i = 0; i < indent; ++i) llvm::errs() << "  ";
+    llvm::errs() << "  at: ";
+    info->loop.getLoc().print(llvm::errs());
+    llvm::errs() << "\n";
+
+    // Recursively prints child loops.
+    for (LoopInfo *child : info->children) {
+      printLoop(child, indent + 1);
+    }
+  };
+
+  for (LoopInfo *topLoop : topLevelLoops) {
+    printLoop(topLoop, 0);
+  }
+
+  llvm::errs() << "=== End Loop Nest Analysis ===\n\n";
+}
diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
index dc6f4532..8328eb61 100644
--- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
+++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
@@ -343,8 +343,9 @@ struct LowerArithToNeuraPass
                  ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
                  ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
                  ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
+      // Applies the patterns to the function, not the entire module.
       if (failed(
-              applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+              applyPatternsGreedily(func_op, std::move(patterns)))) {
         signalPassFailure();
       }
     }
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 98f5dac2..bb6ccd5a 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -1,6 +1,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 
 add_subdirectory(ArithToNeura)
+add_subdirectory(AffineToNeura)
 add_subdirectory(LlvmToNeura)
 add_subdirectory(MemRefToNeura)
 add_subdirectory(BuiltinToNeura)
@@ -16,6 +17,7 @@ target_link_libraries(MLIRConversion INTERFACE
   MLIRTransforms
   MLIRNeura
   MLIRNeuraArithToNeuraPass
+  MLIRNeuraAffineToNeuraPass
   MLIRNeuraLlvmToNeuraPass
   MLIRNeuraMemRefToNeuraPass
   MLIRNeuraBuiltinToNeuraPass
diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp
index 18b2a899..21d33250 100644
--- a/lib/NeuraDialect/Mapping/mapping_util.cpp
+++ b/lib/NeuraDialect/Mapping/mapping_util.cpp
@@ -57,6 +57,11 @@ OperationKind getOperationKindFromMlirOp(Operation *op) {
   if (isa<FAddFAddOp>(op)) return FAddFAdd;
   if (isa<FMulFAddOp>(op)) return FMulFAdd;
 
+  // Steering control fused operations.
+  if (isa<CarryInvariantOp>(op)) return ICarryInvariant;
+  if (isa<ConditionalSelectOp>(op)) return IConditionalSelect;
+  if (isa<InvariantGroupOp>(op)) return IInvariantGroup;
+
   // Control flow operations
   if (isa<ReturnOp>(op)) return IReturn;
   if (isa<PhiOp>(op)) return IPhi;
@@ -87,6 +92,14 @@ bool is_non_materialized(Operation *op) {
   return mlir::isa(op);
 }
 
+// Returns true if the operation is a steering-mode operation that doesn't
+// 
require DataMovOp wrapping (e.g., constants, carry, invariant).
+bool is_steering_unwrapped_op(Operation *op) {
+  return op && mlir::isa<ConstantOp, CarryOp, InvariantOp, CarryInvariantOp,
+                         ConditionalSelectOp, InvariantGroupOp>(op);
+}
+
 } // namespace neura
 } // namespace mlir
 
@@ -625,9 +638,16 @@ bool mlir::neura::tryRouteDataMove(Operation *mov_op, MappingLoc src_loc,
 
 Operation *mlir::neura::getMaterializedProducer(Value operand) {
   Operation *producer = operand.getDefiningOp();
+
+  // In steering mode, some operations (like constants, carry, invariant)
+  // may not be wrapped by a DataMovOp. Returns them directly.
+  if (is_steering_unwrapped_op(producer)) {
+    return producer;
+  }
+
+  // For operations wrapped by a DataMovOp, finds the actual producer.
   assert(isa<DataMovOp>(producer) &&
-         "Expected operand to be defined by a DataMovOp");
-  // Finds the actual producer.
+         "Expected a DataMovOp as operand producer for non-steering operations");
   auto mov_op = dyn_cast<DataMovOp>(producer);
   auto materialized_producer = mov_op.getOperand().getDefiningOp();
   return materialized_producer;
@@ -760,6 +780,16 @@ bool mlir::neura::isMaterializedReserveUser(Operation *user) {
   if (isa<PhiOp, InvariantOp, CarryOp>(user)) {
     return true;
   }
+  // Fused steering control operations.
+  if (isa<CarryInvariantOp>(user)) {
+    return true;
+  }
+  if (isa<ConditionalSelectOp>(user)) {
+    return true;
+  }
+  if (isa<InvariantGroupOp>(user)) {
+    return true;
+  }
   return false;
 }
 
@@ -961,8 +991,18 @@ bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc,
       continue;
     }
     Operation *data_move = operand.getDefiningOp();
+
+    // In steering mode, some operands may not be produced by a DataMovOp
+    // (e.g., constants, carry, invariant). Skips routing for these operations.
+    if (is_steering_unwrapped_op(data_move)) {
+      llvm::errs() << "Skipping steering unwrapped operand: " << *data_move
+                   << "\n";
+      continue;
+    }
+
     assert(isa<DataMovOp>(data_move) &&
-           "Expected a DataMovOp as operand producer");
+           "Expected a DataMovOp as operand for non-steering operations");
+
     Operation *producer = getMaterializedProducer(operand);
     MappingLoc src_loc = mapping_state.getAllLocsOfOp(producer).back();
diff --git a/test/Conversion/AffineToNeura/complex-affine-expressions.mlir b/test/Conversion/AffineToNeura/complex-affine-expressions.mlir
new file mode 100644
index 00000000..06c417ac
--- /dev/null
+++ b/test/Conversion/AffineToNeura/complex-affine-expressions.mlir
@@ -0,0 +1,90 @@
+// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s
+
+// This test verifies that complex affine expressions are correctly expanded
+// into explicit Neura arithmetic operations.
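+//
+// For instance, the subscript 2 * %i + 1 is expected to expand into the
+// chain below (constants materialized as neura.constant, then mul/add):
+//   %c2  = "neura.constant"() <{value = 2 : index}> : () -> index
+//   %mul = "neura.mul"(%i, %c2) : (index, index) -> index
+//   %c1  = "neura.constant"() <{value = 1 : index}> : () -> index
+//   %add = "neura.add"(%mul, %c1) : (index, index) -> index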
+ +module { + // Test 1: Multiplication expression (d0 * 2) + // CHECK-LABEL: func.func @mul_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C2:.*]] = "neura.constant"() <{value = 2 : index}> : () -> index + // CHECK-NEXT: %[[MUL:.*]] = "neura.mul"(%[[I]], %[[C2]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[MUL]] : index] memref<10xf32> : f32 + // CHECK-NEXT: return + func.func @mul_expression(%arg0: memref<10xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[2 * %i] : memref<10xf32> + } + return + } + + // Test 2: Addition and multiplication (d0 * 2 + 1) + // CHECK-LABEL: func.func @complex_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C2:.*]] = "neura.constant"() <{value = 2 : index}> : () -> index + // CHECK-NEXT: %[[MUL:.*]] = "neura.mul"(%[[I]], %[[C2]]) : (index, index) -> index + // CHECK-NEXT: %[[C1:.*]] = "neura.constant"() <{value = 1 : index}> : () -> index + // CHECK-NEXT: %[[ADD:.*]] = "neura.add"(%[[MUL]], %[[C1]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[ADD]] : index] memref<100xf32> : f32 + // CHECK-NEXT: return + func.func @complex_expression(%arg0: memref<100xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[2 * %i + 1] : memref<100xf32> + } + return + } + + // Test 3: Modulo operation (d0 % 8) + // CHECK-LABEL: func.func @modulo_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 64 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C8:.*]] = "neura.constant"() <{value = 8 : index}> : () -> index + // CHECK-NEXT: %[[REM:.*]] = "neura.rem"(%[[I]], %[[C8]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[REM]] : index] memref<64xf32> : f32 + // CHECK-NEXT: return + func.func @modulo_expression(%arg0: memref<64xf32>) { + affine.for %i = 0 to 64 { + %0 = affine.load %arg0[%i mod 8] : memref<64xf32> + } + return + } + + // Test 4: Floor division (d0 floordiv 4) + // CHECK-LABEL: func.func @floordiv_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 32 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C4_1:.*]] = "neura.constant"() <{value = 4 : index}> : () -> index + // CHECK-NEXT: %[[DIV:.*]] = "neura.div"(%[[I]], %[[C4_1]]) : (index, index) -> index + // CHECK-NEXT: %[[C4_2:.*]] = "neura.constant"() <{value = 4 : index}> : () -> index + // CHECK-NEXT: %[[REM:.*]] = "neura.rem"(%[[I]], %[[C4_2]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[DIV]], %[[REM]] : index, index] memref<8x8xf32> : f32 + // CHECK-NEXT: return + func.func @floordiv_expression(%arg0: memref<8x8xf32>) { + affine.for %i = 0 to 32 { + %row = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%i) + %col = affine.apply affine_map<(d0) -> (d0 mod 4)>(%i) + 
%0 = affine.load %arg0[%row, %col] : memref<8x8xf32> + } + return + } + + // Test 5: Multiple dimensions with complex expressions + // CHECK-LABEL: func.func @multi_dim_complex + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C1:.*]] = "neura.constant"() <{value = 1 : index}> : () -> index + // CHECK-NEXT: %[[ADD:.*]] = "neura.add"(%[[J]], %[[C1]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[ADD]] : index, index] memref<10x20xf32> : f32 + // CHECK-NEXT: return + func.func @multi_dim_complex(%arg0: memref<10x20xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %0 = affine.load %arg0[%i, %j + 1] : memref<10x20xf32> + } + } + return + } +} diff --git a/test/Conversion/AffineToNeura/deep-nesting.mlir b/test/Conversion/AffineToNeura/deep-nesting.mlir new file mode 100644 index 00000000..c558eda0 --- /dev/null +++ b/test/Conversion/AffineToNeura/deep-nesting.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Corner Case: Deeply nested loops (4 levels) - tests perfect nesting with 4D +module { + func.func @deep_nesting_4d(%arg0: memref<5x5x5x5xf32>) { + affine.for %i = 0 to 5 { + affine.for %j = 0 to 5 { + affine.for %k = 0 to 5 { + affine.for %l = 0 to 5 { + %0 = affine.load %arg0[%i, %j, %k, %l] : memref<5x5x5x5xf32> + } + } + } + } + return + } +} + +// ============================================================================ +// Verify transformation: no affine ops, only neura ops, 1 grant_once for perfect nest +// ============================================================================ +// CHECK-LABEL: func.func @deep_nesting_4d +// CHECK-NOT: affine. +// CHECK-NEXT: %[[V0:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VI:.*]] = "neura.loop_control"(%[[V0]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VJ:.*]] = "neura.loop_control"(%[[VI]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[K:.*]], %[[VK:.*]] = "neura.loop_control"(%[[VJ]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[L:.*]], %[[VL:.*]] = "neura.loop_control"(%[[VK]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]], %[[K]], %[[L]] : index, index, index, index] memref<5x5x5x5xf32> : f32 +// CHECK-NEXT: return +// CHECK-NOT: affine. 
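+//
+// Note: all four loop_control ops hang off a single grant_once through the
+// chained valid signals, i.e. a perfect N-deep nest needs exactly one
+// grant_once regardless of depth.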
diff --git a/test/Conversion/AffineToNeura/imperfect-ops-after.mlir b/test/Conversion/AffineToNeura/imperfect-ops-after.mlir new file mode 100644 index 00000000..899dc1c9 --- /dev/null +++ b/test/Conversion/AffineToNeura/imperfect-ops-after.mlir @@ -0,0 +1,29 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Imperfect Nesting: Operations after child loop +module { + func.func @imperfect_ops_after(%arg0: memref<10x20xf32>, %arg1: memref<10xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %0 = affine.load %arg0[%i, %j] : memref<10x20xf32> + } + %cst = arith.constant 1.0 : f32 + affine.store %cst, %arg1[%i] : memref<10xf32> + } + return + } +} + +// ============================================================================ +// Verify transformation: no affine ops, valid signal reuse for inner loop +// ============================================================================ +// CHECK-LABEL: func.func @imperfect_ops_after(%arg0: memref<10x20xf32>, %arg1: memref<10xf32>) +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VI:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VJ:.*]] = "neura.loop_control"(%[[VI]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: %[[CST:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK-NEXT: neura.store_indexed %[[CST]] to %arg1[%[[I]] : index] memref<10xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NOT: affine. diff --git a/test/Conversion/AffineToNeura/loop-nest-optimization.mlir b/test/Conversion/AffineToNeura/loop-nest-optimization.mlir new file mode 100644 index 00000000..3e4af366 --- /dev/null +++ b/test/Conversion/AffineToNeura/loop-nest-optimization.mlir @@ -0,0 +1,98 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Test 1: Perfect nested loops - should reuse valid signals +// CHECK-LABEL: func.func @perfect_nest_2d +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +func.func @perfect_nest_2d(%A: memref<10x20xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %v = affine.load %A[%i, %j] : memref<10x20xf32> + } + } + return +} + +// Test 2: Triple nested loops - should reuse valid signals transitively +// CHECK-LABEL: func.func @perfect_nest_3d +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[K:.*]], %[[VALID_K:.*]] = 
"neura.loop_control"(%[[VALID_J]]) <{end = 30 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]], %[[K]] : index, index, index] memref<10x20x30xf32> : f32 +// CHECK-NEXT: return +func.func @perfect_nest_3d(%A: memref<10x20x30xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + affine.for %k = 0 to 30 { + %v = affine.load %A[%i, %j, %k] : memref<10x20x30xf32> + } + } + } + return +} + +// Test 3: Imperfect nested loop - operations before inner loop +// CHECK-LABEL: func.func @imperfect_nest_before +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +func.func @imperfect_nest_before(%A: memref<10x20xf32>, %B: memref<10xf32>) { + affine.for %i = 0 to 10 { + %c = arith.constant 0.0 : f32 + affine.for %j = 0 to 20 { + %v = affine.load %A[%i, %j] : memref<10x20xf32> + } + } + return +} + +// Test 4: Two separate top-level loops - each should get its own grant_once +// CHECK-LABEL: func.func @two_top_level_loops +// CHECK-NEXT: %[[GRANT1:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT1]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]] : index] memref<10xf32> : f32 +// CHECK-NEXT: %[[GRANT2:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[GRANT2]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg1[%[[J]] : index] memref<20xf32> : f32 +// CHECK-NEXT: return +func.func @two_top_level_loops(%A: memref<10xf32>, %B: memref<20xf32>) { + affine.for %i = 0 to 10 { + %v = affine.load %A[%i] : memref<10xf32> + } + + affine.for %j = 0 to 20 { + %w = affine.load %B[%j] : memref<20xf32> + } + return +} + +// Test 5: Siblings - two inner loops should both reuse parent's valid +// CHECK-LABEL: func.func @sibling_loops +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J1:.*]], %[[VALID_J1:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J1]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: %[[J2:.*]], %[[VALID_J2:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg1[%[[I]], %[[J2]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +func.func @sibling_loops(%A: 
memref<10x20xf32>, %B: memref<10x20xf32>) {
+  affine.for %i = 0 to 10 {
+    // First inner loop
+    affine.for %j = 0 to 20 {
+      %v = affine.load %A[%i, %j] : memref<10x20xf32>
+    }
+
+    // Second inner loop (sibling)
+    affine.for %k = 0 to 20 {
+      %w = affine.load %B[%i, %k] : memref<10x20xf32>
+    }
+  }
+  return
+}
diff --git a/test/Conversion/AffineToNeura/single-iteration.mlir b/test/Conversion/AffineToNeura/single-iteration.mlir
new file mode 100644
index 00000000..08999f38
--- /dev/null
+++ b/test/Conversion/AffineToNeura/single-iteration.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s
+
+// Corner case: single-iteration loop.
+module {
+  func.func @single_iteration(%arg0: memref<1xf32>) {
+    affine.for %i = 0 to 1 {
+      %0 = affine.load %arg0[%i] : memref<1xf32>
+    }
+    return
+  }
+}
+
+// ============================================================================
+// Expected output after the --lower-affine-to-neura transformation:
+// Verify: 1) no affine ops, 2) all neura ops present, 3) exact IR match.
+// ============================================================================
+// CHECK-LABEL: func.func @single_iteration(%arg0: memref<1xf32>)
+// CHECK-NEXT: %[[V0:.*]] = "neura.grant_once"() : () -> i1
+// CHECK-NEXT: %[[NEXT:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[V0]]) <{end = 1 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1)
+// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[NEXT]] : index] memref<1xf32> : f32
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK-NOT: affine.
diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp
index 8969fa56..e88202fe 100644
--- a/tools/mlir-neura-opt/mlir-neura-opt.cpp
+++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp
@@ -1,5 +1,6 @@
 // tools/mlir-neura-opt/mlir-neura-opt.cpp
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/InitAllDialects.h"
@@ -57,6 +58,7 @@ int main(int argc, char **argv) {
   registry.insert();
   registry.insert();
   registry.insert();
+  registry.insert<mlir::affine::AffineDialect>();
   registry.insert();
   registry.insert();
   registry.insert();