diff --git a/include/Conversion/AffineToNeura/LoopNestAnalysis.h b/include/Conversion/AffineToNeura/LoopNestAnalysis.h
new file mode 100644
index 00000000..ce18a4cc
--- /dev/null
+++ b/include/Conversion/AffineToNeura/LoopNestAnalysis.h
@@ -0,0 +1,80 @@
+//===- LoopNestAnalysis.h - Analyze affine loop nests ----------*- C++ -*-===//
+//
+// Loop nest analysis for affine loops.
+//
+// Features:
+// 1. Build the loop hierarchy tree (parent-child relationships, nesting depth)
+// 2. Identify perfect vs. imperfect nesting
+// 3. Support valid-signal reuse optimization for nested loops
+//
+//===----------------------------------------------------------------------===//
+#ifndef CONVERSION_AFFINE_TO_NEURA_LOOP_NEST_ANALYSIS_H
+#define CONVERSION_AFFINE_TO_NEURA_LOOP_NEST_ANALYSIS_H
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include <memory>
+
+namespace mlir {
+namespace neura {
+
+/// Loop information structure - stores all analysis information for a single loop.
+struct LoopInfo {
+  affine::AffineForOp loop;               // The loop operation itself.
+  LoopInfo *parent = nullptr;             // Parent loop (nullptr if top-level).
+  llvm::SmallVector<LoopInfo *> children; // Child loops list.
+  unsigned depth = 0;                     // Nesting depth (0 = top-level).
+  bool isPerfectNest = true;              // Whether it is a perfect nest.
+
+  // Operation lists for imperfect nesting.
+  llvm::SmallVector<Operation *> operationsBeforeChild; // Operations before child loops.
+  llvm::SmallVector<Operation *> operationsAfterChild;  // Operations after child loops.
+
+  LoopInfo(affine::AffineForOp loop) : loop(loop) {}
+};
+
+/// Loop nest analysis class.
+///
+/// Purpose: provides loop hierarchy information to the AffineToNeura pass to
+/// support optimization decisions.
+///
+/// Usage example:
+///   LoopNestAnalysis analysis(func_op);
+///   analysis.dump(); // Prints analysis results.
+///   LoopInfo *info = analysis.getLoopInfo(loop);
+///   if (info && info->parent) {
+///     // This is a nested loop; it can reuse the parent's valid signal.
+///   }
+class LoopNestAnalysis {
+public:
+  /// Constructor - performs loop nest analysis on the given function.
+  explicit LoopNestAnalysis(func::FuncOp func);
+
+  /// Query interfaces.
+  LoopInfo *getLoopInfo(affine::AffineForOp loop) const; // Gets loop information.
+  llvm::ArrayRef<LoopInfo *> getTopLevelLoops() const { return topLevelLoops; } // Gets top-level loops.
+  llvm::ArrayRef<std::unique_ptr<LoopInfo>> getAllLoops() const { return allLoops; } // Gets all loops.
+  bool isPerfectNest(affine::AffineForOp loop) const; // Checks if perfect nest.
+  LoopInfo *getParentLoop(affine::AffineForOp loop) const; // Gets parent loop.
+  llvm::ArrayRef<LoopInfo *> getChildLoops(affine::AffineForOp loop) const; // Gets child loops.
+
+  /// Debug interface - prints analysis results.
+  void dump() const;
+
+private:
+  /// Internal analysis methods.
+  void buildLoopNestTree(func::FuncOp func); // Builds the loop hierarchy tree.
+  void analyzePerfectNests();                // Analyzes perfect-nest characteristics.
+
+  /// Data members.
+  llvm::DenseMap<Operation *, LoopInfo *> loopMap;          // Fast loop lookup table.
+  llvm::SmallVector<std::unique_ptr<LoopInfo>, 8> allLoops; // All loops (owns the LoopInfo objects).
+  llvm::SmallVector<LoopInfo *> topLevelLoops;              // Top-level loop pointers.
+};
+
+} // namespace neura
+} // namespace mlir
+
+#endif
diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 30cbf0e8..15f9b2d6 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -18,6 +18,7 @@ std::unique_ptr<Pass> createLowerArithToNeuraPass();
 std::unique_ptr<Pass> createLowerLlvmToNeuraPass();
 std::unique_ptr<Pass> createLowerMemRefToNeuraPass();
 std::unique_ptr<Pass> createLowerBuiltinToNeuraPass();
+std::unique_ptr<Pass> createLowerAffineToNeuraPass();
 
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index 2e79dd96..7044b9ad 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -32,4 +32,16 @@ def LowerBuiltinToNeura : Pass<"lower-builtin-to-neura", "ModuleOp">{
   let constructor = "mlir::createLowerBuiltinToNeuraPass()";
 }
 
+def LowerAffineToNeura : Pass<"lower-affine-to-neura", "func::FuncOp">{
+  let summary = "Lower perfectly nested affine loops to Neura loop_control operations";
+  let description = [{
+    Converts perfectly nested affine.for loops directly to the Neura dialect
+    using loop_control operations, avoiding the need to flatten to LLVM IR
+    first. This preserves loop structure information for better optimization
+    on dataflow architectures.
+  }];
+  let constructor = "mlir::createLowerAffineToNeuraPass()";
+  let dependentDialects = ["mlir::neura::NeuraDialect", "mlir::affine::AffineDialect"];
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/Architecture/Architecture.h b/include/NeuraDialect/Architecture/Architecture.h
index ca7a4951..bae2db90 100644
--- a/include/NeuraDialect/Architecture/Architecture.h
+++ b/include/NeuraDialect/Architecture/Architecture.h
@@ -57,7 +57,9 @@ enum OperationKind {
   // Loop control operations.
   ILoopControl = 34,
   // Constant operations.
-  IConstant = 35
+  IConstant = 35,
+  // Steering control fused operations.
+  ICarryInvariant = 36, IConditionalSelect = 37, IInvariantGroup = 38
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/include/NeuraDialect/Mapping/mapping_util.h b/include/NeuraDialect/Mapping/mapping_util.h
index 8c301aa1..cf85d2a2 100644
--- a/include/NeuraDialect/Mapping/mapping_util.h
+++ b/include/NeuraDialect/Mapping/mapping_util.h
@@ -12,6 +12,10 @@ OperationKind getOperationKindFromMlirOp(Operation *op);
 // Returns true if the operation does not need CGRA tile placement.
 bool is_non_materialized(Operation *op);
 
+// Returns true if the operation is a steering-mode operation that doesn't
+// require DataMovOp wrapping (e.g., constants, carry, invariant).
+bool is_steering_unwrapped_op(Operation *op);
+
 // Returns true if the operation is a materialized reserve user, i.e.,
 // phi, invariant, carry.
 bool isMaterializedReserveUser(Operation *op);
diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index eeb2677a..91f303fa 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -657,4 +657,133 @@ def Neura_InvariantOp : Op{
   let arguments = (ins AnyType:$initial, AnyType:$condition);
   let results = (outs AnyType:$result);
   let assemblyFormat = "$initial `,` $condition attr-dict `:` type($initial) `,` type($condition) `->` type($result)";
+}
+
+// ============================================================================
+// FUSED OPERATIONS FOR RECMII OPTIMIZATION
+// ============================================================================
+
+// Defines the carry_invariant fused operation.
+def Neura_CarryInvariantOp : Op<Neura_Dialect, "carry_invariant">{
+  let summary = "Fused carry and invariant operation for nested loops.";
+  let description = [{
+    Combines carry and invariant operations into a single operation to reduce
+    RecMII. This is optimized for nested loop patterns where an inner loop's
+    carry result is used as an invariant in the outer loop.
+
+    Semantics:
+    - If inner_condition is false (first inner iteration): returns the initial value.
+    - Else if outer_condition is false (outer loop active, inner loop invariant):
+      returns the initial value from the inner carry.
+    - Else: returns the carried value.
+
+    Replaces the pattern:
+      %carry_result = neura.carry %init, %inner_cond, %carried
+      %inv_result = neura.invariant %carry_result, %outer_cond
+
+    With:
+      %result = neura.carry_invariant %init, %inner_cond, %outer_cond, %carried
+
+    RecMII impact: reduces 2 operations to 1 (-50% on the critical path).
+
+    Example:
+      %out = neura.carry_invariant %init, %inner_cond, %outer_cond, %carried
+             : i64, i1, i1, i64 -> i64
+  }];
+
+  let arguments = (ins
+    AnyType:$initial,
+    AnyType:$inner_condition,
+    AnyType:$outer_condition,
+    AnyType:$carried
+  );
+  let results = (outs AnyType:$result);
+
+  let assemblyFormat = [{
+    $initial `,` $inner_condition `,` $outer_condition `,` $carried attr-dict
+    `:` type($initial) `,` type($inner_condition) `,` type($outer_condition) `,`
+    type($carried) `->` type($result)
+  }];
+}
+
+// Defines the conditional_select fused operation.
+def Neura_ConditionalSelectOp : Op<Neura_Dialect, "cond_select">{
+  let summary = "Fused comparison and conditional selection operation.";
+  let description = [{
+    Combines comparison (icmp) and conditional selection (false_steer) into a
+    single atomic operation to reduce RecMII.
+
+    Semantics:
+    - Performs the comparison: cond = (lhs <predicate> rhs).
+    - If cond is false: returns value.
+    - If cond is true: returns the default value (typically supplied by hardware).
+
+    Replaces the pattern:
+      %cond = neura.icmp %lhs, %rhs <{cmpType = "slt"}>
+      %result = neura.false_steer %value, %cond
+
+    With:
+      %result = neura.cond_select %lhs, %rhs, %value <{predicate = "slt"}>
+
+    RecMII impact: reduces 2 operations to 1 (-50% on the critical path).
+
+    Supported predicates: "eq", "ne", "slt", "sle", "sgt", "sge", "ult", "ule", "ugt", "uge".
+
+    Example:
+      %out = neura.cond_select %a, %b, %val <{predicate = "slt"}>
+             : i64, i64, i64 -> i64
+  }];
+
+  let arguments = (ins
+    AnyType:$lhs,
+    AnyType:$rhs,
+    AnyType:$value,
+    StrAttr:$predicate
+  );
+  let results = (outs AnyType:$result);
+
+  let assemblyFormat = [{
+    $lhs `,` $rhs `,` $value attr-dict `:` type($lhs) `,` type($rhs) `,`
+    type($value) `->` type($result)
+  }];
+}
+
+// Defines the invariant_group batch operation.
+def Neura_InvariantGroupOp : Op<Neura_Dialect, "invariant_group">{
+  let summary = "Batch invariant extraction for multiple values.";
+  let description = [{
+    Extracts multiple invariants that share the same condition in a single
+    operation. This is optimized for nested loops where many values need to
+    be marked as invariant with respect to the outer loop.
+
+    Hardware can optimize this by:
+    - Sharing the condition-checking logic
+    - Extracting invariants in parallel
+    - Reducing control overhead
+
+    Replaces multiple individual invariant operations:
+      %inv1 = neura.invariant %val1, %cond
+      %inv2 = neura.invariant %val2, %cond
+      %inv3 = neura.invariant %val3, %cond
+
+    With a single batch operation:
+      %inv1, %inv2, %inv3 = neura.invariant_group %val1, %val2, %val3, %cond
+
+    ResMII impact: reduces N operations to 1 (improves resource utilization).
+
+    Example:
+      %out1, %out2, %out3 = neura.invariant_group %in1, %in2, %in3, %cond
+                            : i64, i64, i64, i1 -> i64, i64, i64
+  }];
+
+  let arguments = (ins
+    Variadic<AnyType>:$inputs,
+    AnyType:$condition
+  );
+  let results = (outs Variadic<AnyType>:$outputs);
+
+  let assemblyFormat = [{
+    $inputs `,` $condition attr-dict `:` type($inputs) `,` type($condition)
+    `->` type($outputs)
+  }];
+}
\ No newline at end of file
diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td
index 3d70af2c..d7f4974a 100644
--- a/include/NeuraDialect/NeuraPasses.td
+++ b/include/NeuraDialect/NeuraPasses.td
@@ -134,4 +134,5 @@ def RemovePredicatedType : Pass<"remove-predicated-type", "ModuleOp"> {
   }];
   let constructor = "neura::createRemovePredicatedTypePass()";
 }
+
 #endif // NEURA_PASSES_TD
\ No newline at end of file
diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
new file mode 100644
index 00000000..77afea12
--- /dev/null
+++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
@@ -0,0 +1,480 @@
+#include "Common/AcceleratorAttrs.h"
+#include "Conversion/ConversionPasses.h"
+#include "Conversion/AffineToNeura/LoopNestAnalysis.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Region.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "llvm/Support/raw_ostream.h"
+#include <functional>
+
+using namespace mlir;
+using namespace mlir::neura;
+using namespace mlir::func;
+
+#define GEN_PASS_DEF_LOWERAFFINETONEURA
+#include "Conversion/ConversionPasses.h.inc"
+
+namespace {
+LogicalResult convertAffineMapToIndices(AffineMap map, ValueRange map_operands,
+                                        Location loc, PatternRewriter &rewriter,
+                                        SmallVector<Value> &new_indices) {
+  new_indices.clear();
+  new_indices.reserve(map.getNumResults());
+  for (AffineExpr expr : map.getResults()) {
+    if (AffineConstantExpr const_expr = dyn_cast<AffineConstantExpr>(expr)) {
+      IndexType index_type = rewriter.getIndexType();
+      IntegerAttr value_attr =
+          rewriter.getIntegerAttr(index_type, const_expr.getValue());
+      new_indices.push_back(rewriter.create<neura::ConstantOp>(
+          loc, index_type, value_attr));
+    } else if
(AffineDimExpr dim_expr = dyn_cast<AffineDimExpr>(expr)) {
+      if (dim_expr.getPosition() >= map.getNumDims() ||
+          dim_expr.getPosition() >=
+              map_operands.size()) { // Checks against map_operands size for safety.
+        return failure();
+      }
+      new_indices.push_back(map_operands[dim_expr.getPosition()]);
+    } else if (AffineSymbolExpr sym_expr = dyn_cast<AffineSymbolExpr>(expr)) {
+      unsigned symbol_operand_index = map.getNumDims() + sym_expr.getPosition();
+      if (symbol_operand_index >= map_operands.size()) {
+        return failure();
+      }
+      new_indices.push_back(map_operands[symbol_operand_index]);
+    } else {
+      // For more complex affine expressions (e.g., d0 + c1),
+      // expands them into explicit Neura arithmetic operations.
+      // Supports: Add, Mul, Mod, FloorDiv, CeilDiv.
+      llvm::errs() << "[affine2neura] Expanding complex affine expression: "
+                   << expr << "\n";
+
+      // Helper lambda: recursively expands an AffineExpr to a Value.
+      std::function<Value(AffineExpr)> expandExpr =
+          [&](AffineExpr e) -> Value {
+        // Constant expression.
+        if (auto const_expr = dyn_cast<AffineConstantExpr>(e)) {
+          return rewriter.create<neura::ConstantOp>(
+              loc, rewriter.getIndexType(),
+              rewriter.getIntegerAttr(rewriter.getIndexType(),
+                                      const_expr.getValue()));
+        }
+        // Dimension expression.
+        else if (auto dim_expr = dyn_cast<AffineDimExpr>(e)) {
+          return map_operands[dim_expr.getPosition()];
+        }
+        // Symbol expression.
+        else if (auto sym_expr = dyn_cast<AffineSymbolExpr>(e)) {
+          unsigned symbol_operand_index =
+              map.getNumDims() + sym_expr.getPosition();
+          return map_operands[symbol_operand_index];
+        }
+        // Binary operation expression.
+        else if (auto bin_expr = dyn_cast<AffineBinaryOpExpr>(e)) {
+          Value lhs = expandExpr(bin_expr.getLHS());
+          Value rhs = expandExpr(bin_expr.getRHS());
+          if (!lhs || !rhs) {
+            return Value();
+          }
+
+          switch (bin_expr.getKind()) {
+          case AffineExprKind::Add:
+            return rewriter.create<neura::AddOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::Mul:
+            return rewriter.create<neura::MulOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::Mod:
+            return rewriter.create<neura::RemOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::FloorDiv:
+            return rewriter.create<neura::DivOp>(
+                loc, rewriter.getIndexType(), lhs, rhs).getResult();
+          case AffineExprKind::CeilDiv: {
+            // ceildiv(a, b) = floordiv(a + b - 1, b).
+            Value one = rewriter.create<neura::ConstantOp>(
+                loc, rewriter.getIndexType(),
+                rewriter.getIntegerAttr(rewriter.getIndexType(), 1));
+            Value b_minus_1 = rewriter.create<neura::SubOp>(
+                loc, rewriter.getIndexType(), rhs, one).getResult();
+            Value numerator = rewriter.create<neura::AddOp>(
+                loc, rewriter.getIndexType(), lhs, b_minus_1).getResult();
+            return rewriter.create<neura::DivOp>(
+                loc, rewriter.getIndexType(), numerator, rhs).getResult();
+          }
+          default:
+            llvm::errs() << "[affine2neura] Unsupported binary op kind: "
+                         << static_cast<int>(bin_expr.getKind()) << "\n";
+            return Value();
+          }
+        }
+
+        llvm::errs() << "[affine2neura] Unsupported affine expression type\n";
+        return Value();
+      };
+
+      Value expanded = expandExpr(expr);
+      if (!expanded) {
+        // Fallback: if expansion fails, use affine.apply (ensures correctness).
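+        // Illustration: expansion only returns a null Value when it hits an
+        // expression kind outside Add/Mul/Mod/FloorDiv/CeilDiv, so a subscript
+        // like d0 * 2 + 1 is fully expanded above, while anything the switch
+        // does not cover falls through to the affine.apply below.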
+ llvm::errs() << "[affine2neura] Failed to expand, using affine.apply\n"; + AffineMap single_result_map = AffineMap::get( + map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); + expanded = rewriter.create( + loc, single_result_map, map_operands); + } + new_indices.push_back(expanded); + } + } + return success(); +} + +struct AffineLoadLowering : public OpRewritePattern { + AffineLoadLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineLoadOp load_op, + PatternRewriter &rewriter) const override { + Location loc = load_op.getLoc(); + auto memref = load_op.getMemref(); + AffineMap map = load_op.getAffineMap(); + ValueRange map_operands = load_op.getMapOperands(); + // Gets the indices for the load operation. + SmallVector new_indices; + if (failed(convertAffineMapToIndices(map, map_operands, loc, rewriter, + new_indices))) { + return load_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); + } + + MemRefType memref_type = dyn_cast(memref.getType()); + if (!memref_type) { + return load_op.emitError( + "[affine2neura] Base of load is not a MemRefType"); + } + if (new_indices.size() != static_cast(memref_type.getRank())) { + return load_op.emitError( + "[affine2neura] Number of indices from affine map (") + << new_indices.size() << ") does not match memref rank (" + << memref_type.getRank() << ")"; + } + + // Creates the neura.load_indexed operation. + LoadIndexedOp new_load_op = rewriter.create( + loc, load_op.getType(), memref, ValueRange{new_indices}); + + rewriter.replaceOp(load_op, new_load_op.getResult()); + return success(); + } +}; + +struct AffineStoreLowering : public OpRewritePattern { + AffineStoreLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineStoreOp store_op, + PatternRewriter &rewriter) const override { + Location loc = store_op.getLoc(); + auto memref = store_op.getMemref(); + Value value = store_op.getValueToStore(); + AffineMap map = store_op.getAffineMap(); + ValueRange mapOperands = store_op.getMapOperands(); + + SmallVector newIndices; + if (failed(convertAffineMapToIndices(map, mapOperands, loc, rewriter, + newIndices))) { + return store_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); + } + + MemRefType memRefType = dyn_cast(memref.getType()); + if (!memRefType) { + return store_op.emitError( + "[affine2neura] Base of store is not a MemRefType"); + } + if (newIndices.size() != static_cast(memRefType.getRank())) { + return store_op.emitError( + "[affine2neura] Number of indices from affine map (") + << newIndices.size() << ") does not match memref rank (" + << memRefType.getRank() << ")"; + } + + rewriter.create(loc, value, memref, + ValueRange{newIndices}); + rewriter.eraseOp(store_op); + return success(); + } +}; + +struct AffineApplyLowering : public OpRewritePattern { + AffineApplyLowering(MLIRContext *context) + : OpRewritePattern(context, /*benefit=*/1) {} + + LogicalResult matchAndRewrite(affine::AffineApplyOp apply_op, + PatternRewriter &rewriter) const override { + AffineMap map = apply_op.getAffineMap(); + ValueRange operands = apply_op.getMapOperands(); + Location loc = apply_op.getLoc(); + + if (map.getNumResults() != 1) { + return apply_op.emitError( + "[affine2neura] AffineApplyOp must have a single result"); + } + + AffineExpr expr = map.getResult(0); + llvm::errs() << "[affine2neura] Expanding affine.apply expression: " + << expr << "\n"; + 
+    // Helper lambda: recursively expands an AffineExpr to a Value.
+    std::function<Value(AffineExpr)> expandExpr =
+        [&](AffineExpr e) -> Value {
+      // Constant expression.
+      if (auto const_expr = dyn_cast<AffineConstantExpr>(e)) {
+        return rewriter.create<neura::ConstantOp>(
+            loc, rewriter.getIndexType(),
+            rewriter.getIntegerAttr(rewriter.getIndexType(),
+                                    const_expr.getValue()));
+      }
+      // Dimension expression.
+      else if (auto dim_expr = dyn_cast<AffineDimExpr>(e)) {
+        return operands[dim_expr.getPosition()];
+      }
+      // Symbol expression.
+      else if (auto sym_expr = dyn_cast<AffineSymbolExpr>(e)) {
+        unsigned symbol_operand_index =
+            map.getNumDims() + sym_expr.getPosition();
+        return operands[symbol_operand_index];
+      }
+      // Binary operation expression.
+      else if (auto bin_expr = dyn_cast<AffineBinaryOpExpr>(e)) {
+        Value lhs = expandExpr(bin_expr.getLHS());
+        Value rhs = expandExpr(bin_expr.getRHS());
+
+        if (!lhs || !rhs) {
+          return Value();
+        }
+
+        switch (bin_expr.getKind()) {
+        case AffineExprKind::Add:
+          return rewriter.create<neura::AddOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::Mul:
+          return rewriter.create<neura::MulOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::Mod:
+          return rewriter.create<neura::RemOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::FloorDiv:
+          return rewriter.create<neura::DivOp>(
+              loc, rewriter.getIndexType(), lhs, rhs).getResult();
+        case AffineExprKind::CeilDiv: {
+          // ceildiv(a, b) = floordiv(a + b - 1, b).
+          Value one = rewriter.create<neura::ConstantOp>(
+              loc, rewriter.getIndexType(),
+              rewriter.getIntegerAttr(rewriter.getIndexType(), 1));
+          Value b_minus_1 = rewriter.create<neura::SubOp>(
+              loc, rewriter.getIndexType(), rhs, one).getResult();
+          Value numerator = rewriter.create<neura::AddOp>(
+              loc, rewriter.getIndexType(), lhs, b_minus_1).getResult();
+          return rewriter.create<neura::DivOp>(
+              loc, rewriter.getIndexType(), numerator, rhs).getResult();
+        }
+        default:
+          llvm::errs() << "[affine2neura] Unsupported binary op kind: "
+                       << static_cast<int>(bin_expr.getKind()) << "\n";
+          return Value();
+        }
+      }
+
+      llvm::errs() << "[affine2neura] Unsupported affine expression type\n";
+      return Value();
+    };
+
+    Value expanded = expandExpr(expr);
+    if (!expanded) {
+      return apply_op.emitError(
+          "[affine2neura] Failed to expand affine.apply expression");
+    }
+
+    rewriter.replaceOp(apply_op, expanded);
+    return success();
+  }
+};
+
+struct AffineForLowering : public OpRewritePattern<affine::AffineForOp> {
+  const LoopNestAnalysis &analysis;
+  llvm::DenseMap<Operation *, Value> &loopValidSignals;
+
+  AffineForLowering(MLIRContext *context, const LoopNestAnalysis &analysis,
+                    llvm::DenseMap<Operation *, Value> &loopValidSignals)
+      : OpRewritePattern(context, /*benefit=*/1),
+        analysis(analysis), loopValidSignals(loopValidSignals) {}
+
+  LogicalResult matchAndRewrite(affine::AffineForOp for_op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = for_op.getLoc();
+
+    // Extracts the loop bounds - these must be constant.
+    // Dynamic bounds are not supported because neura.loop_control requires
+    // compile-time constant attributes for hardware configuration.
+    if (!for_op.hasConstantLowerBound() || !for_op.hasConstantUpperBound()) {
+      return for_op.emitError(
+          "[affine2neura] Non-constant loop bounds not supported. "
+          "Loop bounds must be compile-time constants for CGRA configuration");
+    }
+
+    int64_t lower_bound = for_op.getConstantLowerBound();
+    int64_t upper_bound = for_op.getConstantUpperBound();
+    int64_t step = for_op.getStepAsInt();
+
+    // Gets the loop nesting information.
+    LoopInfo *loopInfo = analysis.getLoopInfo(for_op);
+    Type i1_type = rewriter.getI1Type();
+    Value parent_valid;
+
+    // Optimization: reuses the parent loop's valid signal for nested loops.
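+    // For a perfect 2-deep nest this yields a single control chain (sketch,
+    // matching the expected IR in test/Conversion/AffineToNeura):
+    //   %g      = "neura.grant_once"() : () -> i1
+    //   %i, %vi = "neura.loop_control"(%g)  <{start = 0, end = 10, step = 1, ...}>
+    //   %j, %vj = "neura.loop_control"(%vi) <{start = 0, end = 20, step = 1, ...}>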
+    // This avoids creating redundant grant_once operations.
+    if (loopInfo && loopInfo->parent) {
+      // This is a nested loop - tries to reuse the parent's loop_valid signal.
+      auto it = loopValidSignals.find(loopInfo->parent->loop.getOperation());
+      if (it != loopValidSignals.end()) {
+        parent_valid = it->second;
+        llvm::errs() << "[affine2neura] Reusing parent valid signal for "
+                     << "nested loop (depth=" << loopInfo->depth << ")\n";
+      } else {
+        // Fallback: the parent is not yet converted, so creates a grant_once.
+        parent_valid = rewriter.create<neura::GrantOnceOp>(
+            loc, i1_type, /*value=*/Value(), /*constant_value=*/nullptr);
+        llvm::errs() << "[affine2neura] Parent valid not available, "
+                     << "creating grant_once for nested loop\n";
+      }
+    } else {
+      // Top-level loop - creates a grant_once.
+      parent_valid = rewriter.create<neura::GrantOnceOp>(
+          loc, i1_type, /*value=*/Value(), /*constant_value=*/nullptr);
+      if (loopInfo) {
+        llvm::errs() << "[affine2neura] Created grant_once for top-level loop "
+                     << "(depth=" << loopInfo->depth << ")\n";
+      }
+    }
+
+    // Creates the loop_control operation.
+    auto index_type = rewriter.getIndexType();
+
+    auto loop_control = rewriter.create<neura::LoopControlOp>(
+        loc,
+        /*resultTypes=*/TypeRange{index_type, i1_type},
+        /*parentValid=*/parent_valid,
+        /*iterationType=*/rewriter.getStringAttr("increment"),
+        /*start=*/rewriter.getI64IntegerAttr(lower_bound),
+        /*end=*/rewriter.getI64IntegerAttr(upper_bound),
+        /*step=*/rewriter.getI64IntegerAttr(step));
+
+    Value loop_index = loop_control.getResult(0);
+    Value loop_valid = loop_control.getResult(1);
+
+    // Stores the loop_valid signal for child loops to use.
+    // This enables the valid-signal reuse optimization for nested loops.
+    loopValidSignals[for_op.getOperation()] = loop_valid;
+
+    // Replaces uses of the induction variable.
+    for_op.getInductionVar().replaceAllUsesWith(loop_index);
+
+    // Inlines the body operations before the for_op.
+    Block &body_block = for_op.getRegion().front();
+    Operation *terminator = body_block.getTerminator();
+    rewriter.eraseOp(terminator); // Removes affine.yield first.
+
+    // Merges the loop body into the parent block before the for_op.
+    // Passes loop_index so the induction-variable block argument is remapped
+    // (the block still carries that argument even after the RAUW above).
+    rewriter.inlineBlockBefore(&body_block, for_op.getOperation(), {loop_index});
+
+    // Erases the for_op.
+    rewriter.eraseOp(for_op);
+
+    return success();
+  }
+};
+
+struct LowerAffineToNeuraPass
+    : public PassWrapper<LowerAffineToNeuraPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerAffineToNeuraPass)
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<neura::NeuraDialect, affine::AffineDialect>();
+  }
+
+  StringRef getArgument() const override { return "lower-affine-to-neura"; }
+  StringRef getDescription() const override {
+    return "Lower affine operations to Neura dialect operations";
+  }
+
+  void runOnOperation() override {
+    ModuleOp module_op = getOperation();
+    MLIRContext *context = module_op.getContext();
+
+    module_op.walk([&](func::FuncOp func_op) {
+      // Checks whether the function targets the neura accelerator; applies to
+      // all functions if no attribute is present.
+      if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) {
+        auto target = func_op->getAttrOfType<StringAttr>(
+            mlir::accel::kAcceleratorAttr);
+        if (!target || target.getValue() != mlir::accel::kNeuraTarget) {
+          return; // Skips this function.
+        }
+      }
+      // If no accelerator attribute, applies the pass anyway (for testing).
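+      // Gating example (illustrative only; the attribute key is whatever
+      // kAcceleratorAttr expands to):
+      //   func.func @kernel(...) attributes {accelerator = "neura"} { ... }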
+
+      // Step 1: Performs loop nest analysis.
+      // This builds the loop hierarchy and identifies perfect/imperfect nests.
+      llvm::errs() << "[affine2neura] Analyzing loop nests in function: "
+                   << func_op.getName() << "\n";
+      LoopNestAnalysis analysis(func_op);
+      analysis.dump(); // Prints analysis results for debugging.
+
+      // Step 2: Creates a map to store loop_valid signals.
+      // This allows nested loops to reuse the parent's valid signal.
+      llvm::DenseMap<Operation *, Value> loopValidSignals;
+
+      // Step 3: Sets up the dialect conversion.
+      // Dialect conversion is used instead of the greedy pattern rewriter because:
+      // 1. It provides better error reporting when conversion fails.
+      // 2. It explicitly defines which operations are legal/illegal.
+      // 3. It is the standard approach for dialect lowering passes.
+      ConversionTarget target(*context);
+      target.addLegalDialect<neura::NeuraDialect, arith::ArithDialect,
+                             func::FuncDialect, memref::MemRefDialect>();
+      target.addIllegalDialect<affine::AffineDialect>();
+
+      // Step 4: Registers the rewrite patterns with the analysis.
+      RewritePatternSet patterns(context);
+      patterns.add<AffineLoadLowering, AffineStoreLowering,
+                   AffineApplyLowering>(context);
+      // Passes references to the analysis and the loopValidSignals map.
+      patterns.add<AffineForLowering>(context, std::cref(analysis),
+                                      std::ref(loopValidSignals));
+
+      if (failed(applyPartialConversion(func_op, target, std::move(patterns)))) {
+        func_op.emitError("[affine2neura] Failed to lower affine "
+                          "operations to Neura dialect");
+        signalPassFailure();
+      }
+    });
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::createLowerAffineToNeuraPass() {
+  return std::make_unique<LowerAffineToNeuraPass>();
+}
\ No newline at end of file
diff --git a/lib/Conversion/AffineToNeura/CMakeLists.txt b/lib/Conversion/AffineToNeura/CMakeLists.txt
new file mode 100644
index 00000000..285099f3
--- /dev/null
+++ b/lib/Conversion/AffineToNeura/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_mlir_dialect_library(MLIRNeuraAffineToNeuraPass
+  AffineToNeuraPass.cpp
+  LoopNestAnalysis.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/Conversion
+
+  DEPENDS
+  MLIRConversionIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRAffineDialect
+  MLIRNeura
+  MLIRIR
+  MLIRPass
+  MLIRSupport
+  MLIRTransforms
+  MLIRFuncDialect
+)
diff --git a/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp b/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp
new file mode 100644
index 00000000..64b6a029
--- /dev/null
+++ b/lib/Conversion/AffineToNeura/LoopNestAnalysis.cpp
@@ -0,0 +1,191 @@
+#include "Conversion/AffineToNeura/LoopNestAnalysis.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::neura;
+
+/// Constructor - performs the complete loop nest analysis.
+LoopNestAnalysis::LoopNestAnalysis(func::FuncOp func) {
+  llvm::errs() << "[LoopNestAnalysis] Starting analysis for function: "
+               << func.getName() << "\n";
+  buildLoopNestTree(func);
+  llvm::errs() << "[LoopNestAnalysis] Found " << allLoops.size() << " loops\n";
+  analyzePerfectNests();
+  llvm::errs() << "[LoopNestAnalysis] Analysis complete\n";
+}
+
+// Builds the loop hierarchy tree.
+void LoopNestAnalysis::buildLoopNestTree(func::FuncOp func) {
+  // Step 1: Collects all loops.
+  func.walk([&](affine::AffineForOp loop) {
+    auto loopInfo = std::make_unique<LoopInfo>(loop);
+    loopMap[loop.getOperation()] = loopInfo.get();
+    allLoops.push_back(std::move(loopInfo));
+  });
+
+  // Step 2: Establishes parent-child relationships.
+  for (auto &loopInfoPtr : allLoops) {
+    LoopInfo *loopInfo = loopInfoPtr.get();
+    affine::AffineForOp loop = loopInfo->loop;
+
+    // Searches upward for the parent loop.
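+    // E.g., in "affine.for %i { affine.for %j { ... } }", the walk from the
+    // %j loop stops at the first enclosing affine.for (the %i loop), so
+    // parent(%j) = %i and depth(%j) = depth(%i) + 1.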
+    Operation *parentOp = loop->getParentOp();
+    while (parentOp && !isa<func::FuncOp>(parentOp)) {
+      if (auto parentLoop = dyn_cast<affine::AffineForOp>(parentOp)) {
+        auto it = loopMap.find(parentLoop.getOperation());
+        if (it != loopMap.end()) {
+          loopInfo->parent = it->second;
+          loopInfo->depth = loopInfo->parent->depth + 1; // depth = parent_depth + 1
+          it->second->children.push_back(loopInfo);
+        }
+        break;
+      }
+      parentOp = parentOp->getParentOp();
+    }
+
+    // If there is no parent loop, this is a top-level loop.
+    if (!loopInfo->parent) {
+      topLevelLoops.push_back(loopInfo);
+    }
+  }
+}
+
+// Analyzes perfect-nesting characteristics.
+void LoopNestAnalysis::analyzePerfectNests() {
+  for (auto &loopInfoPtr : allLoops) {
+    LoopInfo *info = loopInfoPtr.get();
+
+    // Leaf loops are trivially perfect.
+    if (info->children.empty()) {
+      info->isPerfectNest = true;
+      continue;
+    }
+
+    Block &body = info->loop.getRegion().front();
+
+    // Builds the child-loop operation set for fast lookup.
+    llvm::DenseSet<Operation *> childLoopOps;
+    for (LoopInfo *child : info->children) {
+      childLoopOps.insert(child->loop.getOperation());
+    }
+
+    Operation *firstChild = info->children.front()->loop.getOperation();
+    Operation *lastChild = info->children.back()->loop.getOperation();
+
+    // Checks whether operations exist before the first child loop.
+    for (Operation &op : body.getOperations()) {
+      if (&op == firstChild) break;
+      if (isa<affine::AffineYieldOp>(&op)) continue;
+      info->operationsBeforeChild.push_back(&op);
+      info->isPerfectNest = false; // Operations before child → imperfect.
+    }
+
+    // Checks whether operations exist after the last child loop.
+    bool afterLastChild = false;
+    for (Operation &op : body.getOperations()) {
+      if (&op == lastChild) {
+        afterLastChild = true;
+        continue;
+      }
+      if (afterLastChild && !isa<affine::AffineYieldOp>(&op)) {
+        info->operationsAfterChild.push_back(&op);
+        info->isPerfectNest = false; // Operations after child → imperfect.
+      }
+    }
+
+    // Checks whether operations exist between sibling child loops.
+    // Example: affine.for %i { affine.for %j1; op; affine.for %j2 }
+    if (info->children.size() > 1) {
+      bool betweenChildren = false;
+      Operation *prevChild = nullptr;
+
+      for (Operation &op : body.getOperations()) {
+        if (childLoopOps.contains(&op)) {
+          if (prevChild && betweenChildren) {
+            info->isPerfectNest = false; // Operations between siblings → imperfect.
+            break;
+          }
+          prevChild = &op;
+          betweenChildren = false;
+        } else if (prevChild && !isa<affine::AffineYieldOp>(&op)) {
+          betweenChildren = true;
+        }
+      }
+    }
+  }
+}
+
+
+// Query interface implementation.
+
+// Queries LoopInfo by loop operation.
+LoopInfo *LoopNestAnalysis::getLoopInfo(affine::AffineForOp loop) const {
+  auto it = loopMap.find(loop.getOperation());
+  return it != loopMap.end() ? it->second : nullptr;
+}
+
+// Checks whether the loop is a perfect nest.
+bool LoopNestAnalysis::isPerfectNest(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ? info->isPerfectNest : false;
+}
+
+// Gets the parent loop.
+LoopInfo *LoopNestAnalysis::getParentLoop(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ? info->parent : nullptr;
+}
+
+// Gets the list of child loops.
+llvm::ArrayRef<LoopInfo *>
+LoopNestAnalysis::getChildLoops(affine::AffineForOp loop) const {
+  LoopInfo *info = getLoopInfo(loop);
+  return info ?
llvm::ArrayRef<LoopInfo *>(info->children)
+              : llvm::ArrayRef<LoopInfo *>();
+}
+
+
+// Debug output implementation.
+void LoopNestAnalysis::dump() const {
+  llvm::errs() << "=== Loop Nest Analysis ===\n";
+  llvm::errs() << "Total loops: " << allLoops.size() << "\n";
+  llvm::errs() << "Top-level loops: " << topLevelLoops.size() << "\n\n";
+
+  // Recursive print function.
+  std::function<void(LoopInfo *, unsigned)> printLoop;
+  printLoop = [&](LoopInfo *info, unsigned indent) {
+    // Prints indentation.
+    for (unsigned i = 0; i < indent; ++i) llvm::errs() << "  ";
+
+    // Prints basic loop information.
+    llvm::errs() << "Loop (depth=" << info->depth
+                 << ", perfect=" << (info->isPerfectNest ? "yes" : "no")
+                 << ", children=" << info->children.size() << ")";
+
+    // For an imperfect nest, prints detailed information.
+    if (!info->isPerfectNest) {
+      llvm::errs() << " [IMPERFECT: "
+                   << "ops_before=" << info->operationsBeforeChild.size()
+                   << ", ops_after=" << info->operationsAfterChild.size()
+                   << "]";
+    }
+    llvm::errs() << "\n";
+
+    // Prints location information.
+    for (unsigned i = 0; i < indent; ++i) llvm::errs() << "  ";
+    llvm::errs() << "  at: ";
+    info->loop.getLoc().print(llvm::errs());
+    llvm::errs() << "\n";
+
+    // Recursively prints child loops.
+    for (LoopInfo *child : info->children) {
+      printLoop(child, indent + 1);
+    }
+  };
+
+  for (LoopInfo *topLoop : topLevelLoops) {
+    printLoop(topLoop, 0);
+  }
+
+  llvm::errs() << "=== End Loop Nest Analysis ===\n\n";
+}
diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
index dc6f4532..8328eb61 100644
--- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
+++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp
@@ -343,8 +343,9 @@ struct LowerArithToNeuraPass
                  ArithFDivToNeuraFDiv, ArithExtfToNeuraCast, ArithMulFToNeuraFMul,
                  ArithSubIToNeuraSub, ArithSubFToNeuraFSub, ArithMulIToNeuraMul,
                  ArithDivSIToNeuraDiv, ArithRemSIToNeuraOp>(context);
+      // Applies the patterns to the function, not the entire module.
       if (failed(
-              applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+              applyPatternsGreedily(func_op, std::move(patterns)))) {
         signalPassFailure();
       }
     }
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index 98f5dac2..bb6ccd5a 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -1,6 +1,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 
 add_subdirectory(ArithToNeura)
+add_subdirectory(AffineToNeura)
 add_subdirectory(LlvmToNeura)
 add_subdirectory(MemRefToNeura)
 add_subdirectory(BuiltinToNeura)
@@ -16,6 +17,7 @@ target_link_libraries(MLIRConversion INTERFACE
   MLIRTransforms
   MLIRNeura
   MLIRNeuraArithToNeuraPass
+  MLIRNeuraAffineToNeuraPass
   MLIRNeuraLlvmToNeuraPass
   MLIRNeuraMemRefToNeuraPass
   MLIRNeuraBuiltinToNeuraPass
diff --git a/lib/NeuraDialect/Mapping/mapping_util.cpp b/lib/NeuraDialect/Mapping/mapping_util.cpp
index 18b2a899..21d33250 100644
--- a/lib/NeuraDialect/Mapping/mapping_util.cpp
+++ b/lib/NeuraDialect/Mapping/mapping_util.cpp
@@ -57,6 +57,11 @@ OperationKind getOperationKindFromMlirOp(Operation *op) {
   if (isa<FAddFAddOp>(op)) return FAddFAdd;
   if (isa<FMulFAddOp>(op)) return FMulFAdd;
 
+  // Steering control fused operations.
+  if (isa<CarryInvariantOp>(op)) return ICarryInvariant;
+  if (isa<ConditionalSelectOp>(op)) return IConditionalSelect;
+  if (isa<InvariantGroupOp>(op)) return IInvariantGroup;
+
   // Control flow operations
   if (isa<ReturnOp>(op)) return IReturn;
   if (isa<PhiOp>(op)) return IPhi;
@@ -87,6 +92,14 @@ bool is_non_materialized(Operation *op) {
   return mlir::isa(op);
 }
 
+// Returns true if the operation is a steering-mode operation that doesn't
+// 
require DataMovOp wrapping (e.g., constants, carry, invariant).
+bool is_steering_unwrapped_op(Operation *op) {
+  return op && mlir::isa<ConstantOp, CarryOp, InvariantOp, CarryInvariantOp,
+                         ConditionalSelectOp, InvariantGroupOp>(op);
+}
+
 } // namespace neura
 } // namespace mlir
 
@@ -625,9 +638,16 @@ bool mlir::neura::tryRouteDataMove(Operation *mov_op, MappingLoc src_loc,
 
 Operation *mlir::neura::getMaterializedProducer(Value operand) {
   Operation *producer = operand.getDefiningOp();
+
+  // In steering mode, some operations (like constants, carry, invariant)
+  // may not be wrapped by a DataMovOp. Returns them directly.
+  if (is_steering_unwrapped_op(producer)) {
+    return producer;
+  }
+
+  // For operations wrapped by a DataMovOp, finds the actual producer.
   assert(isa<DataMovOp>(producer) &&
-         "Expected operand to be defined by a DataMovOp");
-  // Finds the actual producer.
+         "Expected a DataMovOp as operand producer for non-steering operations");
   auto mov_op = dyn_cast<DataMovOp>(producer);
   auto materialized_producer = mov_op.getOperand().getDefiningOp();
   return materialized_producer;
@@ -760,6 +780,16 @@ bool mlir::neura::isMaterializedReserveUser(Operation *user) {
   if (isa<PhiOp, InvariantOp, CarryOp>(user)) {
     return true;
   }
+  // Fused steering control operations.
+  if (isa<CarryInvariantOp>(user)) {
+    return true;
+  }
+  if (isa<ConditionalSelectOp>(user)) {
+    return true;
+  }
+  if (isa<InvariantGroupOp>(user)) {
+    return true;
+  }
   return false;
 }
 
@@ -961,8 +991,18 @@ bool mlir::neura::placeAndRoute(Operation *op, const MappingLoc &target_loc,
       continue;
     }
     Operation *data_move = operand.getDefiningOp();
+
+    // In steering mode, some operands may not be produced by a DataMovOp
+    // (e.g., constants, carry, invariant). Skips routing for these operations.
+    if (is_steering_unwrapped_op(data_move)) {
+      llvm::errs() << "Skipping steering unwrapped operand: " << *data_move
+                   << "\n";
+      continue;
+    }
+
     assert(isa<DataMovOp>(data_move) &&
-           "Expected a DataMovOp as operand producer");
+           "Expected a DataMovOp as operand for non-steering operations");
+
     Operation *producer = getMaterializedProducer(operand);
     MappingLoc src_loc = mapping_state.getAllLocsOfOp(producer).back();
diff --git a/test/Conversion/AffineToNeura/complex-affine-expressions.mlir b/test/Conversion/AffineToNeura/complex-affine-expressions.mlir
new file mode 100644
index 00000000..06c417ac
--- /dev/null
+++ b/test/Conversion/AffineToNeura/complex-affine-expressions.mlir
@@ -0,0 +1,90 @@
+// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s
+
+// This test verifies that complex affine expressions are correctly expanded
+// into explicit Neura arithmetic operations.
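+//
+// For instance, the subscript 2 * %i + 1 is expected to expand into the
+// chain below (constants materialized as neura.constant, then mul/add):
+//   %c2  = "neura.constant"() <{value = 2 : index}> : () -> index
+//   %mul = "neura.mul"(%i, %c2) : (index, index) -> index
+//   %c1  = "neura.constant"() <{value = 1 : index}> : () -> index
+//   %add = "neura.add"(%mul, %c1) : (index, index) -> index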
+ +module { + // Test 1: Multiplication expression (d0 * 2) + // CHECK-LABEL: func.func @mul_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C2:.*]] = "neura.constant"() <{value = 2 : index}> : () -> index + // CHECK-NEXT: %[[MUL:.*]] = "neura.mul"(%[[I]], %[[C2]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[MUL]] : index] memref<10xf32> : f32 + // CHECK-NEXT: return + func.func @mul_expression(%arg0: memref<10xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[2 * %i] : memref<10xf32> + } + return + } + + // Test 2: Addition and multiplication (d0 * 2 + 1) + // CHECK-LABEL: func.func @complex_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C2:.*]] = "neura.constant"() <{value = 2 : index}> : () -> index + // CHECK-NEXT: %[[MUL:.*]] = "neura.mul"(%[[I]], %[[C2]]) : (index, index) -> index + // CHECK-NEXT: %[[C1:.*]] = "neura.constant"() <{value = 1 : index}> : () -> index + // CHECK-NEXT: %[[ADD:.*]] = "neura.add"(%[[MUL]], %[[C1]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[ADD]] : index] memref<100xf32> : f32 + // CHECK-NEXT: return + func.func @complex_expression(%arg0: memref<100xf32>) { + affine.for %i = 0 to 10 { + %0 = affine.load %arg0[2 * %i + 1] : memref<100xf32> + } + return + } + + // Test 3: Modulo operation (d0 % 8) + // CHECK-LABEL: func.func @modulo_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 64 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C8:.*]] = "neura.constant"() <{value = 8 : index}> : () -> index + // CHECK-NEXT: %[[REM:.*]] = "neura.rem"(%[[I]], %[[C8]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[REM]] : index] memref<64xf32> : f32 + // CHECK-NEXT: return + func.func @modulo_expression(%arg0: memref<64xf32>) { + affine.for %i = 0 to 64 { + %0 = affine.load %arg0[%i mod 8] : memref<64xf32> + } + return + } + + // Test 4: Floor division (d0 floordiv 4) + // CHECK-LABEL: func.func @floordiv_expression + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 32 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C4_1:.*]] = "neura.constant"() <{value = 4 : index}> : () -> index + // CHECK-NEXT: %[[DIV:.*]] = "neura.div"(%[[I]], %[[C4_1]]) : (index, index) -> index + // CHECK-NEXT: %[[C4_2:.*]] = "neura.constant"() <{value = 4 : index}> : () -> index + // CHECK-NEXT: %[[REM:.*]] = "neura.rem"(%[[I]], %[[C4_2]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[DIV]], %[[REM]] : index, index] memref<8x8xf32> : f32 + // CHECK-NEXT: return + func.func @floordiv_expression(%arg0: memref<8x8xf32>) { + affine.for %i = 0 to 32 { + %row = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%i) + %col = affine.apply affine_map<(d0) -> (d0 mod 4)>(%i) + 
%0 = affine.load %arg0[%row, %col] : memref<8x8xf32> + } + return + } + + // Test 5: Multiple dimensions with complex expressions + // CHECK-LABEL: func.func @multi_dim_complex + // CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 + // CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) + // CHECK-NEXT: %[[C1:.*]] = "neura.constant"() <{value = 1 : index}> : () -> index + // CHECK-NEXT: %[[ADD:.*]] = "neura.add"(%[[J]], %[[C1]]) : (index, index) -> index + // CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[ADD]] : index, index] memref<10x20xf32> : f32 + // CHECK-NEXT: return + func.func @multi_dim_complex(%arg0: memref<10x20xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %0 = affine.load %arg0[%i, %j + 1] : memref<10x20xf32> + } + } + return + } +} diff --git a/test/Conversion/AffineToNeura/deep-nesting.mlir b/test/Conversion/AffineToNeura/deep-nesting.mlir new file mode 100644 index 00000000..c558eda0 --- /dev/null +++ b/test/Conversion/AffineToNeura/deep-nesting.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Corner Case: Deeply nested loops (4 levels) - tests perfect nesting with 4D +module { + func.func @deep_nesting_4d(%arg0: memref<5x5x5x5xf32>) { + affine.for %i = 0 to 5 { + affine.for %j = 0 to 5 { + affine.for %k = 0 to 5 { + affine.for %l = 0 to 5 { + %0 = affine.load %arg0[%i, %j, %k, %l] : memref<5x5x5x5xf32> + } + } + } + } + return + } +} + +// ============================================================================ +// Verify transformation: no affine ops, only neura ops, 1 grant_once for perfect nest +// ============================================================================ +// CHECK-LABEL: func.func @deep_nesting_4d +// CHECK-NOT: affine. +// CHECK-NEXT: %[[V0:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VI:.*]] = "neura.loop_control"(%[[V0]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VJ:.*]] = "neura.loop_control"(%[[VI]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[K:.*]], %[[VK:.*]] = "neura.loop_control"(%[[VJ]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[L:.*]], %[[VL:.*]] = "neura.loop_control"(%[[VK]]) <{end = 5 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]], %[[K]], %[[L]] : index, index, index, index] memref<5x5x5x5xf32> : f32 +// CHECK-NEXT: return +// CHECK-NOT: affine. 
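+//
+// Note: all four loop_control ops hang off a single grant_once through the
+// chained valid signals, i.e. a perfect N-deep nest needs exactly one
+// grant_once regardless of depth.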
diff --git a/test/Conversion/AffineToNeura/imperfect-ops-after.mlir b/test/Conversion/AffineToNeura/imperfect-ops-after.mlir new file mode 100644 index 00000000..899dc1c9 --- /dev/null +++ b/test/Conversion/AffineToNeura/imperfect-ops-after.mlir @@ -0,0 +1,29 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Imperfect Nesting: Operations after child loop +module { + func.func @imperfect_ops_after(%arg0: memref<10x20xf32>, %arg1: memref<10xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %0 = affine.load %arg0[%i, %j] : memref<10x20xf32> + } + %cst = arith.constant 1.0 : f32 + affine.store %cst, %arg1[%i] : memref<10xf32> + } + return + } +} + +// ============================================================================ +// Verify transformation: no affine ops, valid signal reuse for inner loop +// ============================================================================ +// CHECK-LABEL: func.func @imperfect_ops_after(%arg0: memref<10x20xf32>, %arg1: memref<10xf32>) +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VI:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VJ:.*]] = "neura.loop_control"(%[[VI]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: %[[CST:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK-NEXT: neura.store_indexed %[[CST]] to %arg1[%[[I]] : index] memref<10xf32> : f32 +// CHECK-NEXT: return +// CHECK-NEXT: } +// CHECK-NOT: affine. diff --git a/test/Conversion/AffineToNeura/loop-nest-optimization.mlir b/test/Conversion/AffineToNeura/loop-nest-optimization.mlir new file mode 100644 index 00000000..3e4af366 --- /dev/null +++ b/test/Conversion/AffineToNeura/loop-nest-optimization.mlir @@ -0,0 +1,98 @@ +// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s + +// Test 1: Perfect nested loops - should reuse valid signals +// CHECK-LABEL: func.func @perfect_nest_2d +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +func.func @perfect_nest_2d(%A: memref<10x20xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + %v = affine.load %A[%i, %j] : memref<10x20xf32> + } + } + return +} + +// Test 2: Triple nested loops - should reuse valid signals transitively +// CHECK-LABEL: func.func @perfect_nest_3d +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[K:.*]], %[[VALID_K:.*]] = 
"neura.loop_control"(%[[VALID_J]]) <{end = 30 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]], %[[K]] : index, index, index] memref<10x20x30xf32> : f32 +// CHECK-NEXT: return +func.func @perfect_nest_3d(%A: memref<10x20x30xf32>) { + affine.for %i = 0 to 10 { + affine.for %j = 0 to 20 { + affine.for %k = 0 to 30 { + %v = affine.load %A[%i, %j, %k] : memref<10x20x30xf32> + } + } + } + return +} + +// Test 3: Imperfect nested loop - operations before inner loop +// CHECK-LABEL: func.func @imperfect_nest_before +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +func.func @imperfect_nest_before(%A: memref<10x20xf32>, %B: memref<10xf32>) { + affine.for %i = 0 to 10 { + %c = arith.constant 0.0 : f32 + affine.for %j = 0 to 20 { + %v = affine.load %A[%i, %j] : memref<10x20xf32> + } + } + return +} + +// Test 4: Two separate top-level loops - each should get its own grant_once +// CHECK-LABEL: func.func @two_top_level_loops +// CHECK-NEXT: %[[GRANT1:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT1]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]] : index] memref<10xf32> : f32 +// CHECK-NEXT: %[[GRANT2:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[J:.*]], %[[VALID_J:.*]] = "neura.loop_control"(%[[GRANT2]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg1[%[[J]] : index] memref<20xf32> : f32 +// CHECK-NEXT: return +func.func @two_top_level_loops(%A: memref<10xf32>, %B: memref<20xf32>) { + affine.for %i = 0 to 10 { + %v = affine.load %A[%i] : memref<10xf32> + } + + affine.for %j = 0 to 20 { + %w = affine.load %B[%j] : memref<20xf32> + } + return +} + +// Test 5: Siblings - two inner loops should both reuse parent's valid +// CHECK-LABEL: func.func @sibling_loops +// CHECK-NEXT: %[[GRANT:.*]] = "neura.grant_once"() : () -> i1 +// CHECK-NEXT: %[[I:.*]], %[[VALID_I:.*]] = "neura.loop_control"(%[[GRANT]]) <{end = 10 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %[[J1:.*]], %[[VALID_J1:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[I]], %[[J1]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: %[[J2:.*]], %[[VALID_J2:.*]] = "neura.loop_control"(%[[VALID_I]]) <{end = 20 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1) +// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg1[%[[I]], %[[J2]] : index, index] memref<10x20xf32> : f32 +// CHECK-NEXT: return +func.func @sibling_loops(%A: 
memref<10x20xf32>, %B: memref<10x20xf32>) {
+  affine.for %i = 0 to 10 {
+    // First inner loop
+    affine.for %j = 0 to 20 {
+      %v = affine.load %A[%i, %j] : memref<10x20xf32>
+    }
+
+    // Second inner loop (sibling)
+    affine.for %k = 0 to 20 {
+      %w = affine.load %B[%i, %k] : memref<10x20xf32>
+    }
+  }
+  return
+}
diff --git a/test/Conversion/AffineToNeura/single-iteration.mlir b/test/Conversion/AffineToNeura/single-iteration.mlir
new file mode 100644
index 00000000..08999f38
--- /dev/null
+++ b/test/Conversion/AffineToNeura/single-iteration.mlir
@@ -0,0 +1,23 @@
+// RUN: mlir-neura-opt %s --lower-affine-to-neura | FileCheck %s
+
+// Corner case: single-iteration loop.
+module {
+  func.func @single_iteration(%arg0: memref<1xf32>) {
+    affine.for %i = 0 to 1 {
+      %0 = affine.load %arg0[%i] : memref<1xf32>
+    }
+    return
+  }
+}
+
+// ============================================================================
+// Expected output after the --lower-affine-to-neura transformation:
+// Verify: 1) no affine ops, 2) all neura ops present, 3) exact IR match.
+// ============================================================================
+// CHECK-LABEL: func.func @single_iteration(%arg0: memref<1xf32>)
+// CHECK-NEXT: %[[V0:.*]] = "neura.grant_once"() : () -> i1
+// CHECK-NEXT: %[[NEXT:.*]], %[[VALID:.*]] = "neura.loop_control"(%[[V0]]) <{end = 1 : i64, iterationType = "increment", start = 0 : i64, step = 1 : i64}> : (i1) -> (index, i1)
+// CHECK-NEXT: %{{.*}} = neura.load_indexed %arg0[%[[NEXT]] : index] memref<1xf32> : f32
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK-NOT: affine.
diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp
index 8969fa56..e88202fe 100644
--- a/tools/mlir-neura-opt/mlir-neura-opt.cpp
+++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp
@@ -1,5 +1,6 @@
 // tools/mlir-neura-opt/mlir-neura-opt.cpp
 
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/InitAllDialects.h"
@@ -57,6 +58,7 @@ int main(int argc, char **argv) {
   registry.insert();
   registry.insert();
   registry.insert();
+  registry.insert<mlir::affine::AffineDialect>();
   registry.insert();
   registry.insert();
   registry.insert();