diff --git a/include/Conversion/AffineToNeura/LoopNestAnalysis.h b/include/Conversion/AffineToNeura/LoopNestAnalysis.h
new file mode 100644
index 00000000..67517371
--- /dev/null
+++ b/include/Conversion/AffineToNeura/LoopNestAnalysis.h
@@ -0,0 +1,70 @@
+#ifndef CONVERSION_AFFINE_TO_NEURA_LOOP_NEST_ANALYSIS_H
+#define CONVERSION_AFFINE_TO_NEURA_LOOP_NEST_ANALYSIS_H
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include <memory>
+
+namespace mlir {
+namespace neura {
+
+/// Loop information structure - stores all analysis information for a single loop.
+struct LoopInfo {
+  affine::AffineForOp loop;                // The loop operation itself.
+  LoopInfo *parent = nullptr;              // Parent loop (nullptr if top-level).
+  llvm::SmallVector<LoopInfo *> children;  // Child loops.
+  unsigned depth = 0;                      // Nesting depth (0 = top-level).
+  bool is_perfect_nest = true;             // Whether this loop is a perfect nest.
+
+  // Operation lists for imperfect nests.
+  llvm::SmallVector<Operation *> operations_before_child; // Operations before the child loops.
+  llvm::SmallVector<Operation *> operations_after_child;  // Operations after the child loops.
+
+  LoopInfo(affine::AffineForOp loop) : loop(loop) {}
+};
+
+/// Loop nest analysis class.
+///
+/// Purpose: Provides loop hierarchy information to the AffineToNeura pass to support optimization decisions.
+///
+/// Usage example:
+///   LoopNestAnalysis analysis(func_op);
+///   analysis.dump(); // Prints analysis results.
+///   LoopInfo *info = analysis.getLoopInfo(loop);
+///   if (info && info->parent) {
+///     // This is a nested loop; it can reuse the parent's valid signal.
+///   }
+class LoopNestAnalysis {
+public:
+  /// Constructor - performs loop nest analysis on the given function.
+  explicit LoopNestAnalysis(func::FuncOp func);
+
+  /// Query interfaces.
+  LoopInfo *getLoopInfo(affine::AffineForOp loop) const; // Gets loop information.
+  llvm::ArrayRef<LoopInfo *> getTopLevelLoops() const { return topLevelLoops; } // Gets top-level loops.
+  llvm::ArrayRef<std::unique_ptr<LoopInfo>> getAllLoops() const { return allLoops; } // Gets all loops.
+  bool isPerfectNest(affine::AffineForOp loop) const;      // Checks whether the loop is a perfect nest.
+  LoopInfo *getParentLoop(affine::AffineForOp loop) const; // Gets the parent loop.
+  llvm::ArrayRef<LoopInfo *> getChildLoops(affine::AffineForOp loop) const; // Gets the child loops.
+
+  /// Debug interface - prints analysis results.
+  void dump() const;
+
+private:
+  /// Internal analysis methods.
+  void buildLoopNestTree(func::FuncOp func); // Builds the loop hierarchy tree.
+  void analyzePerfectNests();                // Analyzes perfect-nest characteristics.
+
+  /// Data members.
+  llvm::DenseMap<Operation *, LoopInfo *> loopMap;          // Fast loop lookup table.
+  llvm::SmallVector<std::unique_ptr<LoopInfo>, 8> allLoops; // All loops (owns the LoopInfo objects).
+  llvm::SmallVector<LoopInfo *> topLevelLoops;              // Top-level loop pointers.
+};
+
+} // namespace neura
+} // namespace mlir
+
+#endif
diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 30cbf0e8..15f9b2d6 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -18,6 +18,7 @@ std::unique_ptr<Pass> createLowerArithToNeuraPass();
 std::unique_ptr<Pass> createLowerLlvmToNeuraPass();
 std::unique_ptr<Pass> createLowerMemRefToNeuraPass();
 std::unique_ptr<Pass> createLowerBuiltinToNeuraPass();
+std::unique_ptr<Pass> createLowerAffineToNeuraPass();
 
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index 2e79dd96..7044b9ad 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -32,4 +32,16 @@ def LowerBuiltinToNeura : Pass<"lower-builtin-to-neura", "ModuleOp">{
   let constructor = "mlir::createLowerBuiltinToNeuraPass()";
 }
 
+def LowerAffineToNeura : Pass<"lower-affine-to-neura", "func::FuncOp">{
+  let summary = "Lower perfectly nested affine loops to Neura loop_control operations";
+  let description = [{
+    Converts perfectly nested affine.for loops directly to the Neura dialect using
+    loop_control operations, avoiding the need to flatten to LLVM IR first.
+    This preserves loop structure information for better optimization on
+    dataflow architectures.
+  }];
+  let constructor = "mlir::createLowerAffineToNeuraPass()";
+  let dependentDialects = ["mlir::neura::NeuraDialect", "mlir::affine::AffineDialect"];
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h
index ca7a4951..bae2db90 100644
--- a/include/NeuraDialect/Architecture/Architecture.h
+++ b/include/NeuraDialect/Architecture/Architecture.h
@@ -57,7 +57,9 @@ enum OperationKind {
   // Loop control operations.
   ILoopControl = 34,
   // Constant operations.
-  IConstant = 35
+  IConstant = 35,
+  // Steering control fused operations.
+  ICarryInvariant = 36, IConditionalSelect = 37, IInvariantGroup = 38
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h
index 8c301aa1..cf85d2a2 100644
--- a/include/NeuraDialect/Mapping/mapping_util.h
+++ b/include/NeuraDialect/Mapping/mapping_util.h
@@ -12,6 +12,10 @@ OperationKind getOperationKindFromMlirOp(Operation *op);
 // Returns true if the operation does not need CGRA tile placement.
 bool is_non_materialized(Operation *op);
 
+// Returns true if the operation is a steering-mode operation that does not
+// require DataMovOp wrapping (e.g., constants, carry, invariant).
+bool is_steering_unwrapped_op(Operation *op);
+
 // Returns true if the operation is a materialized reserve user, i.e.,
 // phi, invariant, carry.
bool isMaterializedReserveUser(Operation *op); diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index 3d70af2c..d7f4974a 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -134,4 +134,5 @@ def RemovePredicatedType : Pass<"remove-predicated-type", "ModuleOp"> { }]; let constructor = "neura::createRemovePredicatedTypePass()"; } + #endif // NEURA_PASSES_TD \ No newline at end of file diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp new file mode 100644 index 00000000..39051720 --- /dev/null +++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp @@ -0,0 +1,593 @@ +#include "Common/AcceleratorAttrs.h" +#include "Conversion/ConversionPasses.h" +#include "Conversion/AffineToNeura/LoopNestAnalysis.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Region.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/IR/Visitors.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/DialectConversion.h" + +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace mlir; +using namespace mlir::neura; +using namespace mlir::func; + +#define GEN_PASS_DEF_LOWERAFFINETONEURA +#include "Conversion/ConversionPasses.h.inc" + +namespace { +// Converts an AffineMap to explicit index computations using Neura operations. +// This function handles the expansion of affine expressions into arithmetic ops. +// +// Example 1 - Simple dimension access: +// Before: affine_map<(d0, d1) -> (d0, d1)> with operands (%i, %j) +// After: Returns [%i, %j] directly +// +// Example 2 - Constant offset: +// Before: affine_map<(d0) -> (d0 + 5)> with operand %i +// After: %c5 = neura.constant 5 : index +// %result = neura.add %i, %c5 : index +// Returns [%result] +// +// Example 3 - Complex expression: +// Before: affine_map<(d0, d1) -> (d0 * 2 + d1)> with operands (%i, %j) +// After: %c2 = neura.constant 2 : index +// %mul = neura.mul %i, %c2 : index +// %result = neura.add %mul, %j : index +// Returns [%result] +LogicalResult convertAffineMapToIndices(AffineMap map, ValueRange map_operands, + Location loc, PatternRewriter &rewriter, + SmallVector &new_indices) { + new_indices.clear(); + new_indices.reserve(map.getNumResults()); + for (AffineExpr expr : map.getResults()) { + if (AffineConstantExpr const_expr = dyn_cast(expr)) { + IndexType index_type = rewriter.getIndexType(); + IntegerAttr value_attr = + rewriter.getIntegerAttr(index_type, const_expr.getValue()); + new_indices.push_back(rewriter.create( + loc, index_type, value_attr)); + } else if (AffineDimExpr dim_expr = dyn_cast(expr)) { + if (dim_expr.getPosition() >= map.getNumDims() || + dim_expr.getPosition() >= + map_operands + .size()) { // Checks against mapOperands size for safety. 
+ return failure(); + } + new_indices.push_back(map_operands[dim_expr.getPosition()]); + } else if (AffineSymbolExpr sym_expr = dyn_cast(expr)) { + unsigned symbol_operand_index = map.getNumDims() + sym_expr.getPosition(); + if (symbol_operand_index >= map_operands.size()) { + return failure(); + } + new_indices.push_back(map_operands[symbol_operand_index]); + } else { + // For more complex affine expressions (e.g., d0 + c1), + // expands them into explicit Neura arithmetic operations. + // Supports: Add, Mul, Mod, FloorDiv, CeilDiv. + llvm::errs() << "[affine2neura] Expanding complex affine expression: " + << expr << "\n"; + + // Helper lambda: recursively expands AffineExpr to Value. + std::function expandExpr = + [&](AffineExpr e) -> Value { + // Constant expression. + if (auto const_expr = dyn_cast(e)) { + return rewriter.create( + loc, rewriter.getIndexType(), + rewriter.getIntegerAttr(rewriter.getIndexType(), + const_expr.getValue())); + } + // Dimension expression. + else if (auto dim_expr = dyn_cast(e)) { + return map_operands[dim_expr.getPosition()]; + } + // Symbol expression. + else if (auto sym_expr = dyn_cast(e)) { + unsigned symbol_operand_index = + map.getNumDims() + sym_expr.getPosition(); + return map_operands[symbol_operand_index]; + } + // Binary operation expression. + else if (auto bin_expr = dyn_cast(e)) { + Value lhs = expandExpr(bin_expr.getLHS()); + Value rhs = expandExpr(bin_expr.getRHS()); + + switch (bin_expr.getKind()) { + case AffineExprKind::Add: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::Mul: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::Mod: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::FloorDiv: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::CeilDiv: { + // ceildiv(a, b) = floordiv(a + b - 1, b). + Value one = rewriter.create( + loc, rewriter.getIndexType(), + rewriter.getIntegerAttr(rewriter.getIndexType(), 1)); + Value b_minus_1 = rewriter.create( + loc, rewriter.getIndexType(), rhs, one).getResult(); + Value numerator = rewriter.create( + loc, rewriter.getIndexType(), lhs, b_minus_1).getResult(); + return rewriter.create( + loc, rewriter.getIndexType(), numerator, rhs).getResult(); + } + default: + llvm::errs() << "[affine2neura] Unsupported binary op kind: " + << static_cast(bin_expr.getKind()) << "\n"; + return Value(); + } + } + + llvm::errs() << "[affine2neura] Unsupported affine expression type\n"; + return Value(); + }; + + Value expanded = expandExpr(expr); + if (!expanded) { + // Fallback: if expansion fails, use affine.apply (ensures correctness). + llvm::errs() << "[affine2neura] Failed to expand, using affine.apply\n"; + AffineMap single_result_map = AffineMap::get( + map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); + expanded = rewriter.create( + loc, single_result_map, map_operands); + } + new_indices.push_back(expanded); + } + } + return success(); +} + +// Converts affine.load to neura.load_indexed. +// Expands the affine map into explicit index computations. 
+// +// Example 1 - Simple 2D array access: +// Before: %val = affine.load %A[%i, %j] : memref<10x20xf32> +// After: %val = neura.load_indexed %A[%i, %j : index, index] memref<10x20xf32> : f32 +// +// Example 2 - With affine expression: +// Before: %val = affine.load %A[%i * 2 + 1, %j] : memref<100x100xf32> +// After: %c2 = neura.constant 2 : index +// %c1 = neura.constant 1 : index +// %mul = neura.mul %i, %c2 : index +// %idx0 = neura.add %mul, %c1 : index +// %val = neura.load_indexed %A[%idx0, %j : index, index] memref<100x100xf32> : f32 +struct AffineLoadLowering : public OpRewritePattern { + AffineLoadLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineLoadOp load_op, + PatternRewriter &rewriter) const override { + Location loc = load_op.getLoc(); + auto memref = load_op.getMemref(); + AffineMap map = load_op.getAffineMap(); + ValueRange map_operands = load_op.getMapOperands(); + // Gets the indices for the load operation. + SmallVector new_indices; + if (failed(convertAffineMapToIndices(map, map_operands, loc, rewriter, + new_indices))) { + return load_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); + } + + MemRefType memref_type = dyn_cast(memref.getType()); + if (!memref_type) { + return load_op.emitError( + "[affine2neura] Base of load is not a MemRefType"); + } + if (new_indices.size() != static_cast(memref_type.getRank())) { + return load_op.emitError( + "[affine2neura] Number of indices from affine map (") + << new_indices.size() << ") does not match memref rank (" + << memref_type.getRank() << ")"; + } + + // NOTE: No explicit dimension limit is enforced here. The lowering supports + // arbitrary dimensions theoretically. For CGRA hardware with limited address + // generation units, dimension constraints should be handled at a later stage + // (e.g., during mapping or hardware-specific lowering passes). + + // Creates the neura.load_indexed operation. + LoadIndexedOp new_load_op = rewriter.create( + loc, load_op.getType(), memref, ValueRange{new_indices}); + + rewriter.replaceOp(load_op, new_load_op.getResult()); + return success(); + } +}; + +// Converts affine.store to neura.store_indexed. +// Similar to AffineLoadLowering, expands affine maps into explicit indices. 
+// +// Example 1 - Simple store: +// Before: affine.store %val, %A[%i, %j] : memref<10x20xf32> +// After: neura.store_indexed %val to %A[%i, %j : index, index] memref<10x20xf32> : f32 +// +// Example 2 - With affine expression: +// Before: affine.store %val, %A[%i + 1, %j * 2] : memref<100x100xf32> +// After: %c1 = neura.constant 1 : index +// %c2 = neura.constant 2 : index +// %idx0 = neura.add %i, %c1 : index +// %idx1 = neura.mul %j, %c2 : index +// neura.store_indexed %val to %A[%idx0, %idx1 : index, index] memref<100x100xf32> : f32 +struct AffineStoreLowering : public OpRewritePattern { + AffineStoreLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineStoreOp store_op, + PatternRewriter &rewriter) const override { + Location loc = store_op.getLoc(); + auto memref = store_op.getMemref(); + Value value = store_op.getValueToStore(); + AffineMap map = store_op.getAffineMap(); + ValueRange mapOperands = store_op.getMapOperands(); + + SmallVector newIndices; + if (failed(convertAffineMapToIndices(map, mapOperands, loc, rewriter, + newIndices))) { + return store_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); + } + + MemRefType memRefType = dyn_cast(memref.getType()); + if (!memRefType) { + return store_op.emitError( + "[affine2neura] Base of store is not a MemRefType"); + } + if (newIndices.size() != static_cast(memRefType.getRank())) { + return store_op.emitError( + "[affine2neura] Number of indices from affine map (") + << newIndices.size() << ") does not match memref rank (" + << memRefType.getRank() << ")"; + } + + rewriter.create(loc, value, memref, + ValueRange{newIndices}); + rewriter.eraseOp(store_op); + return success(); + } +}; + +// Converts affine.apply to explicit Neura arithmetic operations. +// Recursively expands the affine expression tree into primitive operations. +// +// Example 1 - Linear expression: +// Before: %result = affine.apply affine_map<(d0) -> (d0 + 5)>(%i) +// After: %c5 = neura.constant 5 : index +// %result = neura.add %i, %c5 : index +// +// Example 2 - Multiply-add: +// Before: %result = affine.apply affine_map<(d0, d1) -> (d0 * 2 + d1)>(%i, %j) +// After: %c2 = neura.constant 2 : index +// %mul = neura.mul %i, %c2 : index +// %result = neura.add %mul, %j : index +// +// Example 3 - Modulo operation: +// Before: %result = affine.apply affine_map<(d0) -> (d0 mod 8)>(%i) +// After: %c8 = neura.constant 8 : index +// %result = neura.rem %i, %c8 : index +// +// Example 4 - Complex nested expression: +// Before: %result = affine.apply affine_map<(d0, d1) -> ((d0 + 1) * d1)>(%i, %j) +// After: %c1 = neura.constant 1 : index +// %add = neura.add %i, %c1 : index +// %result = neura.mul %add, %j : index +struct AffineApplyLowering : public OpRewritePattern { + AffineApplyLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineApplyOp apply_op, + PatternRewriter &rewriter) const override { + AffineMap map = apply_op.getAffineMap(); + ValueRange operands = apply_op.getMapOperands(); + Location loc = apply_op.getLoc(); + + // Note: AffineMap can have multiple results in general MLIR contexts + // (e.g., affine_map<(d0, d1) -> (d0 + 1, d1 * 2)> returns two values). + // However, AffineApplyOp specifically enforces single-result maps at + // construction time. This check serves as a safety guard. 
+ // + // Example transformation: + // Before: %result = affine.apply affine_map<(d0, d1) -> (d0 * 2 + d1)>(%i, %j) + // After: %c2 = arith.constant 2 : index + // %mul = arith.muli %i, %c2 : index + // %result = arith.addi %mul, %j : index + if (map.getNumResults() != 1) { + return apply_op.emitError( + "[affine2neura] AffineApplyOp must have a single result"); + } + + AffineExpr expr = map.getResult(0); + llvm::errs() << "[affine2neura] Expanding affine.apply expression: " + << expr << "\n"; + + // Helper lambda: recursively expands AffineExpr to Value. + std::function expandExpr = + [&](AffineExpr e) -> Value { + // Constant expression. + if (auto const_expr = dyn_cast(e)) { + return rewriter.create( + loc, rewriter.getIndexType(), + rewriter.getIntegerAttr(rewriter.getIndexType(), + const_expr.getValue())); + } + // Dimension expression. + else if (auto dim_expr = dyn_cast(e)) { + return operands[dim_expr.getPosition()]; + } + // Symbol expression. + else if (auto sym_expr = dyn_cast(e)) { + unsigned symbol_operand_index = + map.getNumDims() + sym_expr.getPosition(); + return operands[symbol_operand_index]; + } + // Binary operation expression. + else if (auto bin_expr = dyn_cast(e)) { + Value lhs = expandExpr(bin_expr.getLHS()); + Value rhs = expandExpr(bin_expr.getRHS()); + + if (!lhs || !rhs) { + return Value(); + } + + switch (bin_expr.getKind()) { + case AffineExprKind::Add: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::Mul: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::Mod: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::FloorDiv: + return rewriter.create( + loc, rewriter.getIndexType(), lhs, rhs).getResult(); + case AffineExprKind::CeilDiv: { + // ceildiv(a, b) = floordiv(a + b - 1, b). + Value one = rewriter.create( + loc, rewriter.getIndexType(), + rewriter.getIntegerAttr(rewriter.getIndexType(), 1)); + Value b_minus_1 = rewriter.create( + loc, rewriter.getIndexType(), rhs, one).getResult(); + Value numerator = rewriter.create( + loc, rewriter.getIndexType(), lhs, b_minus_1).getResult(); + return rewriter.create( + loc, rewriter.getIndexType(), numerator, rhs).getResult(); + } + default: + llvm::errs() << "[affine2neura] Unsupported binary op kind: " + << static_cast(bin_expr.getKind()) << "\n"; + return Value(); + } + } + + llvm::errs() << "[affine2neura] Unsupported affine expression type\n"; + return Value(); + }; + + Value expanded = expandExpr(expr); + if (!expanded) { + return apply_op.emitError("[affine2neura] Failed to expand affine.apply expression"); + } + + rewriter.replaceOp(apply_op, expanded); + return success(); + } +}; + +// Converts affine.for loops to neura.loop_control with dataflow semantics. +// Creates constant true for top-level loops, reuses parent's valid signal for nested loops. 
+// +// Example 1 - Simple single loop: +// Before: affine.for %i = 0 to 10 { +// %val = affine.load %A[%i] : memref<10xf32> +// } +// After: %c_true = neura.constant 1 : i1 +// %i, %valid1 = "neura.loop_control"(%c_true) <{end = 10, start = 0, step = 1}> : (i1) -> (index, i1) +// %val = neura.load_indexed %A[%i : index] memref<10xf32> : f32 +// +// Example 2 - Nested loops (demonstrates valid signal reuse): +// Before: affine.for %i = 0 to 10 { +// affine.for %j = 0 to 20 { +// %val = affine.load %A[%i, %j] : memref<10x20xf32> +// } +// } +// After: %c_true = neura.constant 1 : i1 +// %i, %valid_i = "neura.loop_control"(%c_true) <{end = 10, start = 0, step = 1}> : (i1) -> (index, i1) +// %j, %valid_j = "neura.loop_control"(%valid_i) <{end = 20, start = 0, step = 1}> : (i1) -> (index, i1) +// %val = neura.load_indexed %A[%i, %j : index, index] memref<10x20xf32> : f32 +// (Note: Inner loop reuses outer loop's valid_i signal, no second constant) +// +// Example 3 - Non-zero bounds and step: +// Before: affine.for %i = 5 to 100 step 2 { +// %val = affine.load %A[%i] : memref<100xf32> +// } +// After: %c_true = neura.constant 1 : i1 +// %i, %valid1 = "neura.loop_control"(%c_true) <{end = 100, start = 5, step = 2}> : (i1) -> (index, i1) +// %val = neura.load_indexed %A[%i : index] memref<100xf32> : f32 +struct AffineForLowering : public OpRewritePattern { + const LoopNestAnalysis &analysis; + llvm::DenseMap &loopValidSignals; + + AffineForLowering(MLIRContext *context, const LoopNestAnalysis &analysis, + llvm::DenseMap &loopValidSignals) + : OpRewritePattern(context, /*benefit=*/1), + analysis(analysis), loopValidSignals(loopValidSignals) {} + + LogicalResult matchAndRewrite(affine::AffineForOp for_op, + PatternRewriter &rewriter) const override { + Location loc = for_op.getLoc(); + + // Extracts loop bounds - must be constant. + // Dynamic bounds are not supported as neura.loop_control requires + // compile-time constant attributes for hardware configuration. + if (!for_op.hasConstantLowerBound() || !for_op.hasConstantUpperBound()) { + return for_op.emitError( + "[affine2neura] Non-constant loop bounds not supported. " + "Loop bounds must be compile-time constants for CGRA configuration"); + } + + int64_t lower_bound = for_op.getConstantLowerBound(); + int64_t upper_bound = for_op.getConstantUpperBound(); + int64_t step = for_op.getStepAsInt(); + + // Get loop nesting information + LoopInfo *loopInfo = analysis.getLoopInfo(for_op); + Type i1_type = rewriter.getI1Type(); + Value parent_valid; + + // Optimization: Reuse parent loop's valid signal for nested loops. + // This avoids creating redundant initialization for each nested loop. 
+ if (loopInfo && loopInfo->parent) { + // This is a nested loop - try to reuse parent's loop_valid signal + auto it = loopValidSignals.find(loopInfo->parent->loop.getOperation()); + if (it != loopValidSignals.end()) { + parent_valid = it->second; + llvm::errs() << "[affine2neura] Reusing parent valid signal for " + << "nested loop (depth=" << loopInfo->depth << ")\n"; + } else { + // Fallback: parent not yet converted, create constant true + IntegerAttr true_attr = rewriter.getIntegerAttr(i1_type, 1); + parent_valid = rewriter.create(loc, i1_type, true_attr); + llvm::errs() << "[affine2neura] Parent valid not available, " + << "creating constant true for nested loop\n"; + } + } else { + // Top-level loop - create constant true to ensure it's always valid + IntegerAttr true_attr = rewriter.getIntegerAttr(i1_type, 1); + parent_valid = rewriter.create(loc, i1_type, true_attr); + if (loopInfo) { + llvm::errs() << "[affine2neura] Created constant true for top-level loop " + << "(depth=" << loopInfo->depth << ")\n"; + } + } + + // Creates loop_control operation. + auto index_type = rewriter.getIndexType(); + + auto loop_control = rewriter.create( + loc, + /*resultTypes=*/TypeRange{index_type, i1_type}, + /*parentValid=*/parent_valid, + /*iterationType=*/rewriter.getStringAttr("increment"), + /*start=*/rewriter.getI64IntegerAttr(lower_bound), + /*end=*/rewriter.getI64IntegerAttr(upper_bound), + /*step=*/rewriter.getI64IntegerAttr(step)); + + Value loop_index = loop_control.getResult(0); + Value loop_valid = loop_control.getResult(1); + + // Store the loop_valid signal for child loops to use. + // This enables the optimization for nested loops. + loopValidSignals[for_op.getOperation()] = loop_valid; + + // Inlines the body operations before the for_op. + Block &body_block = for_op.getRegion().front(); + Operation *terminator = body_block.getTerminator(); + rewriter.eraseOp(terminator); // Removes affine.yield first. + + // Merge the loop body into the parent block before the for_op. + // Pass the loop_index as replacement for the induction variable block argument. + rewriter.inlineBlockBefore(&body_block, for_op.getOperation(), {loop_index}); + + // Erases the for_op. + rewriter.eraseOp(for_op); + + return success(); + } +}; + +struct LowerAffineToNeuraPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerAffineToNeuraPass) + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + StringRef getArgument() const override { return "lower-affine-to-neura"; } + StringRef getDescription() const override { + return "Lower affine operations to Neura dialect operations"; + } + + void runOnOperation() override { + ModuleOp module_op = getOperation(); + MLIRContext *context = module_op.getContext(); + + module_op.walk([&](func::FuncOp func_op) { + // Checks if function targets neura accelerator, or applies to all if no attribute. + if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto target = func_op->getAttrOfType( + mlir::accel::kAcceleratorAttr); + if (!target || target.getValue() != mlir::accel::kNeuraTarget) { + return; // Skips this function. + } + } + // If no accelerator attribute, applies the pass anyway (for testing). 
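+      // Note: kAcceleratorAttr and kNeuraTarget come from Common/AcceleratorAttrs.h
+      // (included above); functions tagged for other accelerators are skipped.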
+ + // Step 1: Perform loop nest analysis + // This builds the loop hierarchy and identifies perfect/imperfect nests + llvm::errs() << "[affine2neura] Analyzing loop nests in function: " + << func_op.getName() << "\n"; + LoopNestAnalysis analysis(func_op); + analysis.dump(); // Print analysis results for debugging + + // Step 2: Create a map to store loop_valid signals + // This allows nested loops to reuse parent's valid signal + llvm::DenseMap loopValidSignals; + + // Step 3: Set up dialect conversion + // We use Dialect Conversion instead of Greedy Pattern Rewriter because: + // 1. It provides better error reporting when conversion fails + // 2. It explicitly defines which operations are legal/illegal + // 3. It's the standard approach for dialect lowering passes + ConversionTarget target(*context); + target.addLegalDialect(); + target.addIllegalDialect(); + + // Step 4: Register rewrite patterns with analysis + RewritePatternSet patterns(context); + patterns.add(context); + // Pass references to the analysis and loopValidSignals map + patterns.add(context, std::cref(analysis), + std::ref(loopValidSignals)); + + if (failed(applyPartialConversion(func_op, target, std::move(patterns)))) { + func_op.emitError("[affine2neura] Failed to lower affine " + "operations to Neura dialect"); + signalPassFailure(); + } + }); + } +}; +} // namespace + +std::unique_ptr mlir::createLowerAffineToNeuraPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/Conversion/AffineToNeura/CMakeLists.txt b/lib/Conversion/AffineToNeura/CMakeLists.txt new file mode 100644 index 00000000..285099f3 --- /dev/null +++ b/lib/Conversion/AffineToNeura/CMakeLists.txt @@ -0,0 +1,19 @@ +add_mlir_dialect_library(MLIRNeuraAffineToNeuraPass + AffineToNeuraPass.cpp + LoopNestAnalysis.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/Conversion + + DEPENDS + MLIRConversionIncGen + + LINK_LIBS PUBLIC + MLIRAffineDialect + MLIRNeura + MLIRIR + MLIRPass + MLIRSupport + MLIRTransforms + MLIRFuncDialect +) diff --git a/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp b/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp new file mode 100644 index 00000000..e7410994 --- /dev/null +++ b/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp @@ -0,0 +1,191 @@ +#include "Conversion/AffineToNeura/LoopNestAnalysis.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::neura; + +/// Constructor - Performs complete loop nest analysis. +LoopNestAnalysis::LoopNestAnalysis(func::FuncOp func) { + llvm::errs() << "[LoopNestAnalysis] Starting analysis for function: " + << func.getName() << "\n"; + buildLoopNestTree(func); + llvm::errs() << "[LoopNestAnalysis] Found " << allLoops.size() << " loops\n"; + analyzePerfectNests(); + llvm::errs() << "[LoopNestAnalysis] Analysis complete\n"; +} + +// Builds the loop hierarchy tree. +void LoopNestAnalysis::buildLoopNestTree(func::FuncOp func) { + // Step 1: Collects all loops. + func.walk([&](affine::AffineForOp loop) { + auto loopInfo = std::make_unique(loop); + loopMap[loop.getOperation()] = loopInfo.get(); + allLoops.push_back(std::move(loopInfo)); + }); + + // Step 2: Establishes parent-child relationships. + for (auto &loopInfoPtr : allLoops) { + LoopInfo *loopInfo = loopInfoPtr.get(); + affine::AffineForOp loop = loopInfo->loop; + + // Searches upward for parent loop. 
+    Operation *parentOp = loop->getParentOp();
+    while (parentOp && !isa<func::FuncOp>(parentOp)) {
+      if (auto parentLoop = dyn_cast<affine::AffineForOp>(parentOp)) {
+        auto it = loopMap.find(parentLoop.getOperation());
+        if (it != loopMap.end()) {
+          loopInfo->parent = it->second;
+          loopInfo->depth = loopInfo->parent->depth + 1; // depth = parent_depth + 1.
+          it->second->children.push_back(loopInfo);
+        }
+        break;
+      }
+      parentOp = parentOp->getParentOp();
+    }
+
+    // If there is no parent loop, this is a top-level loop.
+    if (!loopInfo->parent) {
+      topLevelLoops.push_back(loopInfo);
+    }
+  }
+}
+
+// Analyzes perfect nesting characteristics.
+void LoopNestAnalysis::analyzePerfectNests() {
+  for (auto &loopInfoPtr : allLoops) {
+    LoopInfo *info = loopInfoPtr.get();
+
+    // Leaf loops are automatically perfect.
+    if (info->children.empty()) {
+      info->is_perfect_nest = true;
+      continue;
+    }
+
+    Block &body = info->loop.getRegion().front();
+
+    // Builds a child-loop operation set for fast lookup.
+    llvm::DenseSet<Operation *> childLoopOps;
+    for (LoopInfo *child : info->children) {
+      childLoopOps.insert(child->loop.getOperation());
+    }
+
+    Operation *firstChild = info->children.front()->loop.getOperation();
+    Operation *lastChild = info->children.back()->loop.getOperation();
+
+    // Checks whether operations exist before the first child loop.
+    for (Operation &op : body.getOperations()) {
+      if (&op == firstChild) break;
+      if (isa<affine::AffineYieldOp>(&op)) continue;
+      info->operations_before_child.push_back(&op);
+      info->is_perfect_nest = false; // Operations before a child → imperfect.
+    }
+
+    // Checks whether operations exist after the last child loop.
+    bool afterLastChild = false;
+    for (Operation &op : body.getOperations()) {
+      if (&op == lastChild) {
+        afterLastChild = true;
+        continue;
+      }
+      if (afterLastChild && !isa<affine::AffineYieldOp>(&op)) {
+        info->operations_after_child.push_back(&op);
+        info->is_perfect_nest = false; // Operations after a child → imperfect.
+      }
+    }
+
+    // Checks whether operations exist between sibling child loops.
+    // Example: affine.for i { affine.for j1; op; affine.for j2 }
+    if (info->children.size() > 1) {
+      bool betweenChildren = false;
+      Operation *prevChild = nullptr;
+
+      for (Operation &op : body.getOperations()) {
+        if (childLoopOps.contains(&op)) {
+          if (prevChild && betweenChildren) {
+            info->is_perfect_nest = false; // Operations between siblings → imperfect.
+            break;
+          }
+          prevChild = &op;
+          betweenChildren = false;
+        } else if (prevChild && !isa<affine::AffineYieldOp>(&op)) {
+          betweenChildren = true;
+        }
+      }
+    }
+  }
+}
+
+
+// Query Interface Implementation.
+
+// Queries LoopInfo by loop operation.
+LoopInfo *LoopNestAnalysis::getLoopInfo(affine::AffineForOp loop) const {
+  auto it = loopMap.find(loop.getOperation());
+  return it != loopMap.end() ? it->second : nullptr;
+}
+
+// Checks whether the loop is a perfect nest.
+bool LoopNestAnalysis::isPerfectNest(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ? info->is_perfect_nest : false;
+}
+
+// Gets the parent loop.
+LoopInfo *LoopNestAnalysis::getParentLoop(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ? info->parent : nullptr;
+}
+
+// Gets the list of child loops.
+llvm::ArrayRef<LoopInfo *>
+LoopNestAnalysis::getChildLoops(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ? llvm::ArrayRef<LoopInfo *>(info->children)
+              : llvm::ArrayRef<LoopInfo *>();
+}
+
+
+// Debug Output Implementation.
+void LoopNestAnalysis::dump() const {
+  llvm::errs() << "=== Loop Nest Analysis ===\n";
+  llvm::errs() << "Total loops: " << allLoops.size() << "\n";
+  llvm::errs() << "Top-level loops: " << topLevelLoops.size() << "\n\n";
+
+  // Recursive print function.
+  std::function<void(LoopInfo *, unsigned)> printLoop;
+  printLoop = [&](LoopInfo *info, unsigned indent) {
+    // Prints indentation.
+    for (unsigned i = 0; i < indent; ++i) llvm::errs() << "  ";
+
+    // Prints basic loop information.
+    llvm::errs() << "Loop (depth=" << info->depth
+                 << ", perfect=" << (info->is_perfect_nest ? "yes" : "no")
+                 << ", children=" << info->children.size() << ")";
+
+    // If the nest is imperfect, prints detailed information.
+    if (!info->is_perfect_nest) {
+      llvm::errs() << " [IMPERFECT: "
+                   << "ops_before=" << info->operations_before_child.size()
+                   << ", ops_after=" << info->operations_after_child.size()
+                   << "]";
+    }
+    llvm::errs() << "\n";
+
+    // Prints location information.
+    for (unsigned i = 0; i < indent; ++i) llvm::errs() << "  ";
+    llvm::errs() << "  at: ";
+    info->loop.getLoc().print(llvm::errs());
+    llvm::errs() << "\n";
+
+    // Recursively prints child loops.
+    for (LoopInfo *child : info->children) {
+      printLoop(child, indent + 1);
+    }
+  };
+
+  for (LoopInfo *topLoop : topLevelLoops) {
+    printLoop(topLoop, 0);
+  }
+
+  llvm::errs() << "=== End Loop Nest Analysis ===\n\n";
+}
diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
index dc6f4532..8328eb61 100644
--- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
+++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
@@ -343,8 +343,9 @@ struct LowerArithToNeuraPass
                  ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
                  ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
                  ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
+    // Applies patterns to the function, not the entire module.
     if (failed(
-            applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+            applyPatternsGreedily(func_op, std::move(patterns)))) {
       signalPassFailure();
     }
   }
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 98f5dac2..bb6ccd5a 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -1,6 +1,7 @@
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 
 add_subdirectory(ArithToNeura)
+add_subdirectory(AffineToNeura)
 add_subdirectory(LlvmToNeura)
 add_subdirectory(MemRefToNeura)
 add_subdirectory(BuiltinToNeura)
@@ -16,6 +17,7 @@ target_link_libraries(MLIRConversion INTERFACE
   MLIRTransforms
   MLIRNeura
   MLIRNeuraArithToNeuraPass
+  MLIRNeuraAffineToNeuraPass
   MLIRNeuraLlvmToNeuraPass
   MLIRNeuraMemRefToNeuraPass
   MLIRNeuraBuiltinToNeuraPass
diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp
index 18b2a899..c3b3696d 100644
--- a/lib/NeuraDialect/Mapping/mapping_util.cpp
+++ b/lib/NeuraDialect/Mapping/mapping_util.cpp
@@ -625,9 +625,16 @@ bool mlir::neura::tryRouteDataMove(Operation *mov_op, MappingLoc src_loc,
 
 Operation *mlir::neura::getMaterializedProducer(Value operand) {
   Operation *producer = operand.getDefiningOp();
+
+  // ReserveOp is not wrapped by DataMovOp (see InsertDataMovPass).
+  // Returns it directly, as it represents the loop-carried dependency placeholder.
+  if (isa<neura::ReserveOp>(producer)) {
+    return producer;
+  }
+
+  // For operations wrapped by DataMovOp, finds the actual producer.
assert(isa(producer) && - "Expected operand to be defined by a DataMovOp"); - // Finds the actual producer. + "Expected a DataMovOp as operand producer for non-ReserveOp operations"); auto mov_op = dyn_cast(producer); auto materialized_producer = mov_op.getOperand().getDefiningOp(); return materialized_producer; @@ -957,12 +964,22 @@ bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc, for (Value operand : op->getOperands()) { llvm::errs() << "Processing operand: " << operand << "\n"; if (isa(operand.getDefiningOp())) { - // Skips Reserve ops (backward ctrl move) when estimate cost. + // Skips Reserve ops (backward ctrl move) when routing. continue; } Operation *data_move = operand.getDefiningOp(); + + // ReserveOp is not wrapped by DataMovOp (see InsertDataMovPass). + // Skip routing for ReserveOp as it represents loop-carried dependency. + if (isa(data_move)) { + llvm::errs() << "Skipping unwrapped operand: " << *data_move + << "\n"; + continue; + } + assert(isa(data_move) && - "Expected a DataMovOp as operand producer"); + "Expected a DataMovOp as operand for non-ReserveOp operations"); + Operation *producer = getMaterializedProducer(operand); MappingLoc src_loc = mapping_state.getAllLocsOfOp(producer).back(); diff --git a/test/Conversion/AffineToNeura/complex-affine-expressions.mlir b/test/Conversion/AffineToNeura/complex-affine-expressions.mlir new file mode 100644 index 00000000..612b1328 --- /dev/null +++ b/test/Conversion/AffineToNeura/complex-affine-expressions.mlir @@ -0,0 +1,106 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// This test verifies that complex affine expressions are correctly expanded +// into explicit Neura arithmetic operations. + +module { + // Test 1: Multiplication expression (d0 * 2) + func.func @mul_expression(%arg0: memref<10xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[2 * %i] : memref<10xf32> + } + return + } + + // Test 2: Addition and multiplication (d0 * 3 + 1) + func.func @complex_expression(%arg0: memref<10xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[3 * %i + 1] : memref<10xf32> + } + return + } + + // Test 3: Modulo operation (d0 % 4) + func.func @modulo_expression(%arg0: memref<10xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[%i mod 4] : memref<10xf32> + } + return + } + + // Test 4: Floor division (d0 floordiv 2) + func.func @floordiv_expression(%arg0: memref<10xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[%i floordiv 2] : memref<10xf32> + } + return + } + + // Test 5: Multiple dimensions with complex expressions + func.func @multi_dim_complex(%arg0: memref<10x20xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %0 = affine.load %arg0[%i, 2 * %i + 3 * %j + 1] : memref<10x20xf32> + } + } + return + } +} + +// CHECK-LABEL: func.func @mul_expression +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 2 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.mul"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}} : index] memref<10xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } + +// CHECK-LABEL: func.func @complex_expression +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = 
"neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 3 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.mul"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.add"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}} : index] memref<10xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } + +// CHECK-LABEL: func.func @modulo_expression +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 4 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.rem"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}} : index] memref<10xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } + +// CHECK-LABEL: func.func @floordiv_expression +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 2 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.div"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}} : index] memref<10xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } + +// CHECK-LABEL: func.func @multi_dim_complex +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%{{.*}}) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 2 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.mul"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 3 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.mul"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = "neura.add"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = 1 : index}> : () -> index +// CHECK-NEXT: %{{.*}} = "neura.add"(%{{.*}}, %{{.*}}) : (index, index) -> index +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}}, %{{.*}} : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } diff --git a/test/Conversion/AffineToNeura/deep-nesting.mlir b/test/Conversion/AffineToNeura/deep-nesting.mlir new file mode 100644 index 00000000..8189c100 --- /dev/null +++ b/test/Conversion/AffineToNeura/deep-nesting.mlir @@ -0,0 +1,29 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Corner Case: Deeply nested loops (4 levels) - tests perfect nesting with 4D +module { + func.func @deep_nesting_4d(%arg0: memref<5x5x5x5xf32>) { + affine.for %i = 0 to 5 { + affine.for %j = 0 to 5 { + affine.for %k = 0 to 5 { + affine.for %l = 0 to 5 { + %0 = affine.load %arg0[%i, %j, %k, %l] : memref<5x5x5x5xf32> + } + } + } 
+ } + return + } +} + +// CHECK-LABEL: func.func @deep_nesting_4d(%arg0: memref<5x5x5x5xf32>) +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%{{.*}}) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%{{.*}}) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%{{.*}}) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index, index, index] memref<5x5x5x5xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NOT: affine. diff --git a/test/Conversion/AffineToNeura/imperfect-ops-after.mlir b/test/Conversion/AffineToNeura/imperfect-ops-after.mlir new file mode 100644 index 00000000..fd9aad1c --- /dev/null +++ b/test/Conversion/AffineToNeura/imperfect-ops-after.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Imperfect Nesting: Operations after child loop +// This tests that inner loop results can be used by outer loop operations +module { + func.func @imperfect_ops_after(%arg0: memref<10x20xf32>, %arg1: memref<10xf32>) { + affine.for %i = 0 to 10 { + // Inner loop: compute sum of row elements + affine.for %j = 0 to 20 { + %elem = affine.load %arg0[%i, %j] : memref<10x20xf32> + // In real code, %elem would be accumulated or used + } + // Operations after inner loop - uses outer loop index + %cst = arith.constant 1.0 : f32 + affine.store %cst, %arg1[%i] : memref<10xf32> + } + return + } +} + +// CHECK-LABEL: func.func @imperfect_ops_after(%arg0: memref<10x20xf32>, %arg1: memref<10xf32>) +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%{{.*}}) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}}, %{{.*}} : index, index] memref<10x20xf32> : f32 +// +// CHECK-NEXT: %{{.*}} = arith.constant 1.000000e+00 : f32 +// CHECK-NEXT: neura.store_indexed %{{.*}} to %arg1[%{{.*}} : index] memref<10xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NOT: affine. diff --git a/test/Conversion/AffineToNeura/loop-nest-optimization.mlir b/test/Conversion/AffineToNeura/loop-nest-optimization.mlir new file mode 100644 index 00000000..c2ca0b9e --- /dev/null +++ b/test/Conversion/AffineToNeura/loop-nest-optimization.mlir @@ -0,0 +1,44 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// This test verifies proper handling of various loop nest patterns. 
+ +module { + func.func @perfect_nest_2d(%arg0: memref<10x20xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %v = affine.load %arg0[%i, %j] : memref<10x20xf32> + } + } + return + } + + func.func @two_top_level_loops(%arg0: memref<10xf32>, %arg1: memref<20xf32>) { + affine.for %i = 0 to 10 { + %v = affine.load %arg0[%i] : memref<10xf32> + } + affine.for %j = 0 to 20 { + %w = affine.load %arg1[%j] : memref<20xf32> + } + return + } +} + +// CHECK-LABEL: func.func @perfect_nest_2d(%arg0: memref<10x20xf32>) +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%{{.*}}) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}}, %{{.*}} : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } + +// CHECK-LABEL: func.func @two_top_level_loops(%arg0: memref<10xf32>, %arg1: memref<20xf32>) +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}} : index] memref<10xf32> : f32 +// +// CHECK-NEXT: %{{.*}} = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%{{.*}}) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg1[%{{.*}} : index] memref<20xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } diff --git a/test/Conversion/AffineToNeura/single-iteration.mlir b/test/Conversion/AffineToNeura/single-iteration.mlir new file mode 100644 index 00000000..3e2bed79 --- /dev/null +++ b/test/Conversion/AffineToNeura/single-iteration.mlir @@ -0,0 +1,20 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Corner Case: Single iteration loop +module { + func.func @single_iteration(%arg0: memref<1xf32>) { + affine.for %i = 0 to 1 { + %0 = affine.load %arg0[%i] : memref<1xf32> + } + return + } +} + +// CHECK-LABEL: func.func @single_iteration(%arg0: memref<1xf32>) +// CHECK-NEXT: %0 = "neura.constant"() <{value = true}> : () -> i1 +// CHECK-NEXT: %{{.*}}, %{{.*}} = "neura.loop_control"(%0) <{end = 1 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%{{.*}} : index] memref<1xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NOT: affine. 
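+
+// Note: this test only exercises --lower-affine-to-neura (see the RUN line above).
+// An illustrative follow-on (not checked here) would pipe the result through the
+// other existing *-to-neura lowering passes, e.g.:
+//   mlir-neura-opt single-iteration.mlir --lower-affine-to-neura --lower-arith-to-neura
+// The exact downstream pipeline ordering is illustrative, not prescriptive.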
diff --git a/test/Conversion/AffineToNeura/unsupported-affine-if.mlir b/test/Conversion/AffineToNeura/unsupported-affine-if.mlir new file mode 100644 index 00000000..1095a239 --- /dev/null +++ b/test/Conversion/AffineToNeura/unsupported-affine-if.mlir @@ -0,0 +1,70 @@ +// RUN: mlir-neura-opt %s --lower-affine | FileCheck %s --check-prefix=CHECK-SCF +// RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-func-to-llvm | FileCheck %s --check-prefix=CHECK-LLVM +// RUN: mlir-neura-opt %s --lower-affine --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-func-to-llvm --lower-llvm-to-neura | FileCheck %s --check-prefix=CHECK-NEURA-BR + +// This test demonstrates the complete multi-stage lowering chain for conditionals. +// Note: Direct lowering affine.if to Neura is not supported. +// +// The complete transformation chain: +// affine.if → scf.if → cf.cond_br → llvm.cond_br → neura.cond_br +// +// While neura.cond_br operations are generated, they cannot be mapped to CGRA +// hardware because CGRAs are spatial dataflow architectures without program +// counters or branch prediction units. + +module { + func.func @affine_if_example(%arg0: memref<10xf32>) { + affine.for %i = 0 to 10 { + affine.if affine_set<(d0) : (d0 - 5 >= 0)>(%i) { + %val = affine.load %arg0[%i] : memref<10xf32> + } + } + return + } +} + +// CHECK-SCF-LABEL: func.func @affine_if_example(%arg0: memref<10xf32>) +// CHECK-SCF-NEXT: %c0 = arith.constant 0 : index +// CHECK-SCF-NEXT: %c10 = arith.constant 10 : index +// CHECK-SCF-NEXT: %c1 = arith.constant 1 : index +// CHECK-SCF-NEXT: scf.for %arg1 = %c0 to %c10 step %c1 +// CHECK-SCF-NEXT: %c0_0 = arith.constant 0 : index +// CHECK-SCF-NEXT: %c-5 = arith.constant -5 : index +// CHECK-SCF-NEXT: %0 = arith.addi %arg1, %c-5 : index +// CHECK-SCF-NEXT: %1 = arith.cmpi sge, %0, %c0_0 : index +// CHECK-SCF-NEXT: scf.if %1 +// CHECK-SCF-NEXT: %2 = memref.load %arg0[%arg1] : memref<10xf32> +// CHECK-SCF-NEXT: } +// CHECK-SCF-NEXT: } +// CHECK-SCF-NEXT: return + +// CHECK-LLVM-LABEL: llvm.func @affine_if_example +// CHECK-LLVM: %{{.*}} = llvm.mlir.constant(0 : index) : i64 +// CHECK-LLVM: %{{.*}} = llvm.mlir.constant(10 : index) : i64 +// CHECK-LLVM: %{{.*}} = llvm.mlir.constant(1 : index) : i64 +// CHECK-LLVM: llvm.br ^bb1(%{{.*}} : i64) +// CHECK-LLVM: ^bb1(%{{.*}}: i64): +// CHECK-LLVM: %{{.*}} = llvm.icmp "slt" %{{.*}}, %{{.*}} : i64 +// CHECK-LLVM: llvm.cond_br %{{.*}}, ^bb2, ^bb5 +// +// CHECK-LLVM: ^bb2: +// CHECK-LLVM: %{{.*}} = llvm.mlir.constant(0 : index) : i64 +// CHECK-LLVM: %{{.*}} = llvm.mlir.constant(-5 : index) : i64 +// CHECK-LLVM: %{{.*}} = llvm.add %{{.*}}, %{{.*}} : i64 +// CHECK-LLVM: %{{.*}} = llvm.icmp "sge" %{{.*}}, %{{.*}} : i64 +// CHECK-LLVM: llvm.cond_br %{{.*}}, ^bb3, ^bb4 + +// CHECK-NEURA-BR-LABEL: llvm.func @affine_if_example +// CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = -5 : index}> : () -> i64 +// CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = 1 : index}> : () -> i64 +// CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = 10 : index}> : () -> i64 +// CHECK-NEURA-BR: %{{.*}} = "neura.constant"() <{value = 0 : index}> : () -> i64 +// CHECK-NEURA-BR: neura.br %{{.*}} : i64 to ^bb1 +// CHECK-NEURA-BR: ^bb1(%{{.*}}: i64): +// CHECK-NEURA-BR: %{{.*}} = "neura.icmp"(%{{.*}}, %{{.*}}) <{cmpType = "slt"}> : (i64, i64) -> i1 +// CHECK-NEURA-BR: neura.cond_br %{{.*}} : i1 then to ^bb2 else to ^bb5 +// +// CHECK-NEURA-BR: ^bb2: +// CHECK-NEURA-BR: %{{.*}} = 
"neura.add"(%{{.*}}, %{{.*}}) : (i64, i64) -> i64 +// CHECK-NEURA-BR: %{{.*}} = "neura.icmp"(%{{.*}}, %{{.*}}) <{cmpType = "sge"}> : (i64, i64) -> i1 +// CHECK-NEURA-BR: neura.cond_br %{{.*}} : i1 then to ^bb3 else to ^bb4 diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index 8969fa56..7edea6b5 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -1,9 +1,13 @@ // tools/mlir-neura-opt/mlir-neura-opt.cpp +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" +#include "mlir/Conversion/Passes.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Support/LogicalResult.h" #include "mlir/Tools/mlir-opt/MlirOptMain.h" @@ -57,6 +61,9 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); registry.insert(); registry.insert(); registry.insert(); @@ -64,6 +71,9 @@ int main(int argc, char **argv) { mlir::neura::registerPasses(); mlir::registerPasses(); mlir::registerViewOpGraphPass(); + + // Register all standard conversion passes + mlir::registerConversionPasses(); // Print architecture spec file info if (!architecture_spec_file.empty()) {