diff --git a/include/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h b/include/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h
new file mode 100644
index 0000000..ca49a24
--- /dev/null
+++ b/include/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h
@@ -0,0 +1,99 @@
+// HyperblockDependencyAnalysis.h - Analyzes dependencies between hyperblocks.
+//
+// This file provides utilities for analyzing data dependencies between
+// hyperblocks within a Taskflow task.
+
+#ifndef TASKFLOW_ANALYSIS_HYPERBLOCK_DEPENDENCY_ANALYSIS_H
+#define TASKFLOW_ANALYSIS_HYPERBLOCK_DEPENDENCY_ANALYSIS_H
+
+#include "TaskflowDialect/TaskflowOps.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace mlir {
+namespace taskflow {
+
+/// Represents the type of data dependency between hyperblocks.
+enum class DependencyType {
+  None, // No dependency; buildFromTask never records an edge of this type.
+  RAW,  // Read-After-Write: source writes a memref that target reads.
+  WAR,  // Write-After-Read: source reads a memref that target writes.
+  WAW   // Write-After-Write: source and target write the same memref.
+};
+
+/// Represents a dependency edge between two hyperblocks.
+struct HyperblockDependencyEdge {
+  TaskflowHyperblockOp source;  // Hyperblock the edge starts from.
+  TaskflowHyperblockOp target;  // Hyperblock the edge points to.
+  DependencyType type;  // Hazard kind that buildFromTask detected.
+  Value memref;  // The memory location causing the dependency.
+};
+
+/// Analyzes dependencies between hyperblocks within a task.
+class HyperblockDependencyGraph {
+public:
+  /// Discards any previous graph and rebuilds it from taskOp's hyperblocks.
+  void buildFromTask(TaskflowTaskOp taskOp);
+
+  /// Clears the hyperblock list and both edge maps.
+  void clear();
+
+  /// Returns true if at least one direct edge leads from source to target.
+  bool hasDependency(TaskflowHyperblockOp source,
+                     TaskflowHyperblockOp target) const;
+
+  /// Returns every direct edge from source to target (one per type and memref).
+  llvm::SmallVector<HyperblockDependencyEdge>
+  getDependencies(TaskflowHyperblockOp source,
+                  TaskflowHyperblockOp target) const;
+
+  /// Returns the distinct sources of edges into op (hyperblocks it depends on).
+  llvm::SmallVector<TaskflowHyperblockOp>
+  getPredecessors(TaskflowHyperblockOp op) const;
+
+  /// Returns the distinct targets of edges out of op (its dependents).
+  llvm::SmallVector<TaskflowHyperblockOp>
+  getSuccessors(TaskflowHyperblockOp op) const;
+
+  /// Checks whether the hyperblocks between a and b permit fusing a with b.
+  bool canFuse(TaskflowHyperblockOp a, TaskflowHyperblockOp b) const;
+
+  /// Checks counters pairwise: same lower/step, uppers within maxBoundDiff.
+  bool areCountersCompatible(TaskflowHyperblockOp a, TaskflowHyperblockOp b,
+                             int maxBoundDiff) const;
+
+  /// Returns the hyperblocks collected by the last buildFromTask call.
+  const llvm::SmallVector<TaskflowHyperblockOp> &getHyperblocks() const {
+    return hyperblocks_;
+  }
+
+private:
+  /// Collects the memrefs read by memref.load operations inside op.
+  llvm::DenseSet<Value> collectReads(TaskflowHyperblockOp op) const;
+
+  /// Collects the memrefs written by memref.store operations inside op.
+  llvm::DenseSet<Value> collectWrites(TaskflowHyperblockOp op) const;
+
+  /// Records one edge in both the successor and the predecessor map.
+  void addEdge(TaskflowHyperblockOp source, TaskflowHyperblockOp target,
+               DependencyType type, Value memref);
+
+  /// All hyperblocks of the analyzed task, in the order the walk found them.
+  llvm::SmallVector<TaskflowHyperblockOp> hyperblocks_;
+
+  /// Maps each hyperblock to the edges that end at it.
+  llvm::DenseMap<TaskflowHyperblockOp,
+                 llvm::SmallVector<HyperblockDependencyEdge>>
+      predecessorEdges_;
+
+  /// Maps each hyperblock to the edges that start at it.
+  llvm::DenseMap<TaskflowHyperblockOp,
+                 llvm::SmallVector<HyperblockDependencyEdge>>
+      successorEdges_;
+};
+
+} // namespace taskflow
+} // namespace mlir
+
+#endif // TASKFLOW_ANALYSIS_HYPERBLOCK_DEPENDENCY_ANALYSIS_H
diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index f621951..367c22f 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -1,4 +1,4 @@
-// TaskflowPasses.h - Header file for Taskflow passes
+// TaskflowPasses.h - Header file for Taskflow passes.
 
 #ifndef TASKFLOW_PASSES_H
 #define TASKFLOW_PASSES_H
@@ -10,15 +10,23 @@
 #include "mlir/Pass/PassRegistry.h"
 
 #include <memory>
+
 namespace mlir {
 namespace taskflow {
-// Passes defined in TaskflowPasses.td
+
+// Passes defined in TaskflowPasses.td.
 #define GEN_PASS_DECL
 #include "TaskflowDialect/TaskflowPasses.h.inc"
+
+/// Creates a pass that constructs hyperblocks and counter chains from tasks.
 std::unique_ptr<mlir::Pass> createConstructHyperblockFromTaskPass();
 
+/// Creates the pass that fuses hyperblocks and removes dead hyperblocks.
+std::unique_ptr<mlir::Pass> createOptimizeTaskGraphPass();
+
 #define GEN_PASS_REGISTRATION
 #include "TaskflowDialect/TaskflowPasses.h.inc"
+
 } // namespace taskflow
 } // namespace mlir
 
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index 1bcf3b2..3b32713 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -15,4 +15,26 @@ def ConstructHyperblockFromTask : Pass<"construct-hyperblock-from-task", "func::
   }];
   let constructor = "taskflow::createConstructHyperblockFromTaskPass()";
 }
+
+def OptimizeTaskGraph : Pass<"optimize-task-graph", "func::FuncOp"> {
+  let summary = "Optimizes Taskflow task graph by fusing hyperblocks and tasks.";
+  let description = [{
+    Performs the following optimizations on the Taskflow task graph:
+    1. Hyperblock Fusion: Merges hyperblocks with compatible counter structures.
+       Loop peeling for differing counter bounds is not yet implemented.
+    2. Task Fusion (not yet implemented): Merges producer-consumer tasks to
+       reduce data transfer overhead between tasks.
+    3. Dead Hyperblock Elimination: Removes unused hyperblocks.
+  }];
+  let constructor = "taskflow::createOptimizeTaskGraphPass()";
+  let options = [
+    Option<"enableHyperblockFusion", "enable-hyperblock-fusion", "bool",
+           /*default=*/"true", "Enables hyperblock fusion optimization.">,
+    Option<"enableTaskFusion", "enable-task-fusion", "bool",
+           /*default=*/"false", "Enables task fusion optimization (not yet implemented).">,
+    Option<"maxBoundDiffForPeeling", "max-bound-diff", "int",
+           /*default=*/"2", "Specifies max loop bound difference for peeling.">
+  ];
+}
+
 #endif // TASKFLOW_PASSES_TD
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Analysis/CMakeLists.txt b/lib/TaskflowDialect/Analysis/CMakeLists.txt
new file mode 100644
index 0000000..b93b278
--- /dev/null
+++ b/lib/TaskflowDialect/Analysis/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_mlir_library(MLIRTaskflowAnalysis
+    HyperblockDependencyAnalysis.cpp
+    # TaskDependencyAnalysis.cpp is not built yet (presumably for task fusion).
+
+    LINK_LIBS PUBLIC
+    MLIRIR
+    MLIRSupport
+    MLIRMemRefDialect
+    MLIRTaskflow
+)
diff --git a/lib/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.cpp b/lib/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.cpp
new file mode 100644
index 0000000..1d9ed88
--- /dev/null
+++ b/lib/TaskflowDialect/Analysis/HyperblockDependencyAnalysis.cpp
@@ -0,0 +1,223 @@
+// HyperblockDependencyAnalysis.cpp - Implements hyperblock dependency analysis.
+
+#include "TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+void HyperblockDependencyGraph::buildFromTask(TaskflowTaskOp taskOp) {
+  clear();
+
+  // Collects all hyperblocks in program order.
+  taskOp.getBody().walk([&](TaskflowHyperblockOp op) {
+    hyperblocks_.push_back(op);
+  });
+
+  // Computes each hyperblock's read and write sets exactly once. The pairwise
+  // comparison below would otherwise re-walk every body for every pair.
+  llvm::SmallVector<llvm::DenseSet<Value>> reads;
+  llvm::SmallVector<llvm::DenseSet<Value>> writes;
+  reads.reserve(hyperblocks_.size());
+  writes.reserve(hyperblocks_.size());
+  for (TaskflowHyperblockOp hb : hyperblocks_) {
+    reads.push_back(collectReads(hb));
+    writes.push_back(collectWrites(hb));
+  }
+
+  // Adds one edge of `type` from hyperblock i to hyperblock j for every memref
+  // that appears in both `from` (accesses of i) and `to` (accesses of j).
+  auto addSharedEdges = [&](size_t i, size_t j,
+                            const llvm::DenseSet<Value> &from,
+                            const llvm::DenseSet<Value> &to,
+                            DependencyType type) {
+    for (Value memref : from) {
+      if (to.contains(memref)) {
+        addEdge(hyperblocks_[i], hyperblocks_[j], type, memref);
+      }
+    }
+  };
+
+  // Visits every pair with i earlier than j, so edges always point forward.
+  for (size_t i = 0; i < hyperblocks_.size(); ++i) {
+    for (size_t j = i + 1; j < hyperblocks_.size(); ++j) {
+      // Per pair, the edges are added in the order RAW, WAR, WAW.
+      addSharedEdges(i, j, writes[i], reads[j], DependencyType::RAW);
+      addSharedEdges(i, j, reads[i], writes[j], DependencyType::WAR);
+      addSharedEdges(i, j, writes[i], writes[j], DependencyType::WAW);
+    }
+  }
+}
+
+void HyperblockDependencyGraph::clear() {
+  successorEdges_.clear();   // Edges keyed by their source hyperblock.
+  predecessorEdges_.clear(); // Edges keyed by their target hyperblock.
+  hyperblocks_.clear();      // Nodes gathered by buildFromTask.
+}
+
+bool HyperblockDependencyGraph::hasDependency(
+    TaskflowHyperblockOp source, TaskflowHyperblockOp target) const {
+  // Only edges stored under `source` can lead directly to `target`.
+  auto entry = successorEdges_.find(source);
+  const bool known = entry != successorEdges_.end();
+  const size_t count = known ? entry->second.size() : 0;
+  for (size_t idx = 0; idx < count; ++idx) {
+    if (entry->second[idx].target == target) {
+      return true;
+    }
+  }
+  return false;
+}
+
+llvm::SmallVector<HyperblockDependencyEdge>
+HyperblockDependencyGraph::getDependencies(TaskflowHyperblockOp source,
+                                            TaskflowHyperblockOp target) const {
+  llvm::SmallVector<HyperblockDependencyEdge> matches;
+  auto entry = successorEdges_.find(source);
+  const bool known = entry != successorEdges_.end();
+  // Keeps, in stored order, the outgoing edges of `source` ending at `target`.
+  for (size_t idx = 0; known && idx < entry->second.size(); ++idx) {
+    if (entry->second[idx].target == target) {
+      matches.push_back(entry->second[idx]);
+    }
+  }
+  return matches;
+}
+
+llvm::SmallVector<TaskflowHyperblockOp>
+HyperblockDependencyGraph::getPredecessors(TaskflowHyperblockOp op) const {
+  llvm::SmallVector<TaskflowHyperblockOp> uniqueSources;
+  auto entry = predecessorEdges_.find(op);
+  if (entry == predecessorEdges_.end()) {
+    return uniqueSources;
+  }
+  // Edges may share a source; each source is reported once, first seen first.
+  llvm::DenseSet<TaskflowHyperblockOp> visited;
+  for (const HyperblockDependencyEdge &incoming : entry->second) {
+    if (visited.insert(incoming.source).second) {
+      uniqueSources.push_back(incoming.source);
+    }
+  }
+  return uniqueSources;
+}
+
+llvm::SmallVector<TaskflowHyperblockOp>
+HyperblockDependencyGraph::getSuccessors(TaskflowHyperblockOp op) const {
+  llvm::SmallVector<TaskflowHyperblockOp> uniqueTargets;
+  auto entry = successorEdges_.find(op);
+  if (entry == successorEdges_.end()) {
+    return uniqueTargets;
+  }
+  // Edges may share a target; each target is reported once, first seen first.
+  llvm::DenseSet<TaskflowHyperblockOp> visited;
+  for (const HyperblockDependencyEdge &outgoing : entry->second) {
+    if (visited.insert(outgoing.target).second) {
+      uniqueTargets.push_back(outgoing.target);
+    }
+  }
+  return uniqueTargets;
+}
+
+bool HyperblockDependencyGraph::canFuse(TaskflowHyperblockOp a,
+                                         TaskflowHyperblockOp b) const {
+  // Fusion moves the later hyperblock up to the position of the earlier one,
+  // so the later block ends up ahead of every block that used to sit between
+  // them. That is only legal if the later block depends on none of those
+  // blocks; whether they depend on the earlier block is irrelevant, because
+  // the earlier block does not move.
+
+  // Finds positions in program order.
+  int posA = -1, posB = -1;
+  for (size_t i = 0; i < hyperblocks_.size(); ++i) {
+    if (hyperblocks_[i] == a) posA = static_cast<int>(i);
+    if (hyperblocks_[i] == b) posB = static_cast<int>(i);
+  }
+
+  // Rejects hyperblocks that are not part of the graph, and self-fusion.
+  if (posA < 0 || posB < 0 || posA == posB) {
+    return false;
+  }
+
+  // Ensures a is the earlier hyperblock and b the later one.
+  if (posA > posB) {
+    std::swap(a, b);
+    std::swap(posA, posB);
+  }
+
+  // Any direct edge from an intermediate hyperblock to b would be reversed by
+  // the move, no matter how that intermediate block relates to a.
+  for (int i = posA + 1; i < posB; ++i) {
+    if (hasDependency(hyperblocks_[i], b)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool HyperblockDependencyGraph::areCountersCompatible(
+    TaskflowHyperblockOp a, TaskflowHyperblockOp b, int maxBoundDiff) const {
+  auto indicesA = a.getIndices();
+  auto indicesB = b.getIndices();
+
+  // Requires same number of indices.
+  if (indicesA.size() != indicesB.size()) {
+    return false;
+  }
+
+  // Checks each counter pair.
+  for (size_t i = 0; i < indicesA.size(); ++i) {
+    auto counterA = indicesA[i].getDefiningOp<TaskflowCounterOp>();
+    auto counterB = indicesB[i].getDefiningOp<TaskflowCounterOp>();
+    // Rejects indices that are not produced by a counter operation.
+    if (!counterA || !counterB) {
+      return false;
+    }
+
+    int64_t lowerA = counterA.getLowerBound().getSExtValue();
+    int64_t upperA = counterA.getUpperBound().getSExtValue();
+    int64_t stepA = counterA.getStep().getSExtValue();
+
+    int64_t lowerB = counterB.getLowerBound().getSExtValue();
+    int64_t upperB = counterB.getUpperBound().getSExtValue();
+    int64_t stepB = counterB.getStep().getSExtValue();
+
+    // Requires same lower bound and step.
+    if (lowerA != lowerB || stepA != stepB) {
+      return false;
+    }
+    // Compares the upper bounds in 64 bits: narrowing the difference to int
+    // first could wrap a huge gap into a small one and report a false match.
+    int64_t diff = upperA > upperB ? upperA - upperB : upperB - upperA;
+    if (diff > maxBoundDiff) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+llvm::DenseSet<Value>
+HyperblockDependencyGraph::collectReads(TaskflowHyperblockOp op) const {
+  // Gathers the memref operand of every memref.load nested in the body.
+  llvm::DenseSet<Value> loaded;
+  auto record = [&](memref::LoadOp ld) { loaded.insert(ld.getMemRef()); };
+  op.getBody().walk(record);
+  return loaded;
+}
+
+llvm::DenseSet<Value>
+HyperblockDependencyGraph::collectWrites(TaskflowHyperblockOp op) const {
+  // Gathers the memref operand of every memref.store nested in the body.
+  llvm::DenseSet<Value> stored;
+  auto record = [&](memref::StoreOp st) { stored.insert(st.getMemRef()); };
+  op.getBody().walk(record);
+  return stored;
+}
+
+void HyperblockDependencyGraph::addEdge(TaskflowHyperblockOp source,
+                                         TaskflowHyperblockOp target,
+                                         DependencyType type, Value memref) {
+  // Files the same edge under both endpoints so either side can look it up.
+  predecessorEdges_[target].push_back({source, target, type, memref});
+  successorEdges_[source].push_back({source, target, type, memref});
+}
diff --git a/lib/TaskflowDialect/CMakeLists.txt b/lib/TaskflowDialect/CMakeLists.txt
index d8e5d7f..c6cb0c5 100644
--- a/lib/TaskflowDialect/CMakeLists.txt
+++ b/lib/TaskflowDialect/CMakeLists.txt
@@ -13,4 +13,5 @@ add_mlir_dialect_library(MLIRTaskflow
         MLIRInferTypeOpInterface
 )
 
+add_subdirectory(Analysis)
 add_subdirectory(Transforms)
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/CMakeLists.txt b/lib/TaskflowDialect/Transforms/CMakeLists.txt
index 270ce96..e8ef663 100644
--- a/lib/TaskflowDialect/Transforms/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 
 add_mlir_library(MLIRTaskflowTransforms
     ConstructHyperblockFromTaskPass.cpp
+    OptimizeTaskGraphPass.cpp
 
     DEPENDS
     MLIRTaskflowTransformsIncGen
@@ -12,6 +13,7 @@ add_mlir_library(MLIRTaskflowTransforms
     MLIRSupport
     MLIRTransforms
     MLIRTaskflow
+    MLIRTaskflowAnalysis
     ${dialect_libs}
     LLVMSupport
 )
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/OptimizeTaskGraphPass.cpp b/lib/TaskflowDialect/Transforms/OptimizeTaskGraphPass.cpp
new file mode 100644
index 0000000..e6356bf
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/OptimizeTaskGraphPass.cpp
@@ -0,0 +1,353 @@
+// OptimizeTaskGraphPass.cpp - Optimizes Taskflow task graph.
+//
+// This pass performs the following optimizations on the Taskflow task graph:
+// 1. Hyperblock Fusion: Merges hyperblocks with compatible counter structures.
+// 2. Task Fusion: Merges producer-consumer tasks to reduce data transfer.
+// 3. Dead Hyperblock Elimination: Removes unused hyperblocks.
+
+#include "TaskflowDialect/Analysis/HyperblockDependencyAnalysis.h"
+// #include "TaskflowDialect/Analysis/TaskDependencyAnalysis.h"
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <memory>
+#include <optional>
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Resource Estimation (for future Architecture integration).
+//===----------------------------------------------------------------------===//
+
+/// Represents the estimated resource requirements for a hyperblock.
+struct ResourceEstimate {
+  int numOperations = 0;  // Every operation visited inside the body.
+  int numMemoryOps = 0;  // memref.load and memref.store operations.
+  int numArithOps = 0;  // Non-memory operations from the "arith" dialect.
+};
+
+/// Counts the operations nested in a hyperblock body, split by category.
+/// Intended for resource constraint checking when Architecture is available.
+[[maybe_unused]]
+static ResourceEstimate estimateHyperblockResources(TaskflowHyperblockOp op) {
+  ResourceEstimate counts;
+  auto classify = [&counts](Operation *nested) {
+    ++counts.numOperations;
+    if (isa<memref::LoadOp, memref::StoreOp>(nested)) {
+      ++counts.numMemoryOps;
+      return;
+    }
+    Dialect *owner = nested->getDialect();
+    const bool isArith = owner && owner->getNamespace() == "arith";
+    counts.numArithOps += isArith ? 1 : 0;
+  };
+  op.getBody().walk(classify);
+  return counts;
+}
+
+//===----------------------------------------------------------------------===//
+// Hyperblock Fusion.
+//===----------------------------------------------------------------------===//
+
+/// Fuses two hyperblocks with identical counter structures.
+/// Clones second's operations to the end of first and merges the yields; fails
+/// without changing the IR when the index counters differ (no peeling here).
+static LogicalResult fuseHyperblocks(TaskflowHyperblockOp first,
+                                      TaskflowHyperblockOp second,
+                                      OpBuilder &builder) {
+  auto indicesFirst = first.getIndices();
+  auto indicesSecond = second.getIndices();
+  if (indicesFirst.size() != indicesSecond.size()) {
+    return failure();
+  }
+
+  // Second's body will iterate over first's index space, and no peeling is
+  // emitted here. Every index pair must therefore be the same value or come
+  // from counters with identical bounds and step; otherwise nothing is fused.
+  auto sameValue = [](const auto &lhs, const auto &rhs) {
+    return lhs.getSExtValue() == rhs.getSExtValue();
+  };
+  for (size_t i = 0; i < indicesFirst.size(); ++i) {
+    if (indicesFirst[i] == indicesSecond[i]) {
+      continue;
+    }
+    auto counterA = indicesFirst[i].getDefiningOp<TaskflowCounterOp>();
+    auto counterB = indicesSecond[i].getDefiningOp<TaskflowCounterOp>();
+    if (!counterA || !counterB ||
+        !sameValue(counterA.getLowerBound(), counterB.getLowerBound()) ||
+        !sameValue(counterA.getUpperBound(), counterB.getUpperBound()) ||
+        !sameValue(counterA.getStep(), counterB.getStep())) {
+      return failure();
+    }
+  }
+
+  Block &firstBlock = first.getBody().front();
+  Block &secondBlock = second.getBody().front();
+  auto firstYield = cast<TaskflowHyperblockYieldOp>(firstBlock.getTerminator());
+  auto secondYield =
+      cast<TaskflowHyperblockYieldOp>(secondBlock.getTerminator());
+
+  // Maps second's index block arguments onto first's.
+  IRMapping mapping;
+  for (size_t i = 0; i < indicesSecond.size(); ++i) {
+    mapping.map(secondBlock.getArgument(i), firstBlock.getArgument(i));
+  }
+
+  // Clones second's operations (except the yield) in front of first's yield.
+  builder.setInsertionPoint(firstYield);
+  for (Operation &op : secondBlock.without_terminator()) {
+    builder.clone(op, mapping);
+  }
+
+  // Replaces first's yield with one that carries both output lists. Second's
+  // outputs go through the mapping because they may name cloned values.
+  SmallVector<Value> combinedOutputs;
+  for (Value output : firstYield.getOutputs()) {
+    combinedOutputs.push_back(output);
+  }
+  for (Value output : secondYield.getOutputs()) {
+    combinedOutputs.push_back(mapping.lookupOrDefault(output));
+  }
+  builder.create<TaskflowHyperblockYieldOp>(firstYield.getLoc(),
+                                             combinedOutputs);
+  firstYield.erase();
+
+  size_t firstOutputCount = first.getOutputs().size();
+  size_t secondOutputCount = second.getOutputs().size();
+  if (firstOutputCount == 0 && secondOutputCount == 0) {
+    // No SSA results anywhere: first already holds the fused body.
+    second.erase();
+    return success();
+  }
+
+  // The result list changes, so a new hyperblock with the combined result
+  // types takes over first's body and both original operations are erased.
+  SmallVector<Type> combinedResultTypes;
+  for (Value res : first.getOutputs()) {
+    combinedResultTypes.push_back(res.getType());
+  }
+  for (Value res : second.getOutputs()) {
+    combinedResultTypes.push_back(res.getType());
+  }
+  builder.setInsertionPoint(first);
+  auto newHyperblock = builder.create<TaskflowHyperblockOp>(
+      first.getLoc(), combinedResultTypes, first.getIndices());
+  newHyperblock.getBody().takeBody(first.getBody());
+
+  for (size_t i = 0; i < firstOutputCount; ++i) {
+    first.getOutputs()[i].replaceAllUsesWith(newHyperblock.getOutputs()[i]);
+  }
+  for (size_t i = 0; i < secondOutputCount; ++i) {
+    second.getOutputs()[i].replaceAllUsesWith(
+        newHyperblock.getOutputs()[firstOutputCount + i]);
+  }
+  first.erase();
+  second.erase();
+  return success();
+}
+
+/// Fuses hyperblocks inside one task until no fusable pair is left.
+/// Every round rebuilds the dependency graph, scans the ordered pairs (i < j),
+/// and fuses the first pair whose counters are compatible (within the peeling
+/// bound) and which the dependency graph reports as safe to fuse.
+/// A pair whose fusion attempt fails is left alone and the scan continues.
+static void fuseHyperblocksInTask(TaskflowTaskOp taskOp,
+                                   int maxBoundDiffForPeeling) {
+  // Shared by all rounds; every fusion sets its own insertion point.
+  OpBuilder builder(taskOp.getContext());
+
+  // Runs one round. Returns true when a pair was fused and false when the
+  // task offers nothing (more) to fuse.
+  auto fuseOnePair = [&]() -> bool {
+    // The graph is rebuilt every round because fusion erases hyperblocks.
+    HyperblockDependencyGraph depGraph;
+    depGraph.buildFromTask(taskOp);
+
+    // Fewer than two hyperblocks cannot form a pair.
+    const auto &candidates = depGraph.getHyperblocks();
+    const size_t total = candidates.size();
+    if (total < 2) {
+      return false;
+    }
+
+    for (size_t i = 0; i < total; ++i) {
+      for (size_t j = i + 1; j < total; ++j) {
+        TaskflowHyperblockOp first = candidates[i];
+        TaskflowHyperblockOp second = candidates[j];
+
+        // Skips pairs with incompatible counters, and pairs that the graph
+        // rejects because of the hyperblocks located between them.
+        const bool eligible =
+            depGraph.areCountersCompatible(first, second,
+                                           maxBoundDiffForPeeling) &&
+            depGraph.canFuse(first, second);
+        if (!eligible) {
+          continue;
+        }
+
+        // A direct dependency first -> second is not treated as blocking:
+        // second's operations are cloned after first's. NOTE(review): this
+        // keeps the order within one iteration only; confirm other accesses.
+        llvm::errs() << "[OptimizeTaskGraph] Fusing hyperblocks at "
+                     << first.getLoc() << " and " << second.getLoc() << "\n";
+
+        // On success the pair is gone, so this round ends here.
+        if (succeeded(fuseHyperblocks(first, second, builder))) {
+          return true;
+        }
+      }
+    }
+
+    // No eligible pair could be fused in this round.
+    return false;
+  };
+
+  // Iterates until a round finds nothing to fuse.
+  bool changed = true;
+  while (changed) {
+    changed = fuseOnePair();
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Task Fusion (placeholder for future implementation).
+//===----------------------------------------------------------------------===//
+
+/// Fuses producer-consumer task pairs.
+/// TODO: Implement the actual task fusion logic; this is currently a no-op.
+[[maybe_unused]]
+static void fuseProducerConsumerTasks(func::FuncOp funcOp) {
+  // Task fusion is not yet implemented.
+  // When enabled, this will:
+  // 1. Build the task dependency graph.
+  // 2. Find producer-consumer pairs.
+  // 3. Check counter compatibility.
+  // 4. Fuse compatible task pairs.
+  (void)funcOp; // Silences the unused-parameter warning.
+}
+
+//===----------------------------------------------------------------------===//
+// Dead Hyperblock Elimination.
+//===----------------------------------------------------------------------===//
+
+/// Reports whether a hyperblock can be removed without observable effect.
+/// A hyperblock counts as dead when its body contains no memref.store and
+/// none of its results has a use.
+static bool isHyperblockDead(TaskflowHyperblockOp op) {
+  // NOTE(review): memref.store is the only side effect recognized here; other
+  // side-effecting operations inside the body are not checked - confirm.
+
+  // A single store anywhere in the body keeps the hyperblock alive.
+  bool storeSeen = false;
+  auto noteStore = [&storeSeen](memref::StoreOp) { storeSeen = true; };
+  op.getBody().walk(noteStore);
+
+  if (storeSeen) {
+    return false;
+  }
+
+  // Without stores, the hyperblock is dead once none of its results is used.
+  bool everyResultUnused = true;
+  for (Value result : op.getResults()) {
+    everyResultUnused = everyResultUnused && result.use_empty();
+  }
+
+  return everyResultUnused;
+}
+
+/// Eliminates dead hyperblocks from a function.
+static void eliminateDeadHyperblocks(func::FuncOp funcOp) {
+  // Dead hyperblocks are collected first and erased only after the walk.
+  SmallVector<TaskflowHyperblockOp> deadOps;
+  funcOp.walk([&deadOps](TaskflowHyperblockOp candidate) {
+    if (!isHyperblockDead(candidate)) {
+      return;
+    }
+    deadOps.push_back(candidate);
+  });
+  for (size_t idx = 0; idx < deadOps.size(); ++idx) {
+    deadOps[idx].erase();
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Implementation.
+//===----------------------------------------------------------------------===//
+
+struct OptimizeTaskGraphPass
+    : public PassWrapper<OptimizeTaskGraphPass, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(OptimizeTaskGraphPass)
+
+  OptimizeTaskGraphPass() = default;
+  OptimizeTaskGraphPass(const OptimizeTaskGraphPass &other)
+      : PassWrapper(other) {}
+
+  StringRef getArgument() const override { return "optimize-task-graph"; }
+
+  StringRef getDescription() const override {
+    return "Optimizes Taskflow task graph by fusing hyperblocks and tasks.";
+  }
+
+  void runOnOperation() override {
+    func::FuncOp funcOp = getOperation();
+
+    // Phase 1: Hyperblock Fusion, applied to every task in the function.
+    if (enableHyperblockFusion) {
+      funcOp.walk([&](TaskflowTaskOp taskOp) {
+        fuseHyperblocksInTask(taskOp, maxBoundDiffForPeeling);
+      });
+    }
+
+    // Phase 2: Task Fusion (currently a placeholder that does nothing).
+    if (enableTaskFusion) {
+      fuseProducerConsumerTasks(funcOp);
+    }
+
+    // Phase 3: Dead Hyperblock Elimination (runs regardless of the options).
+    eliminateDeadHyperblocks(funcOp);
+  }
+
+  Option<bool> enableHyperblockFusion{
+      *this, "enable-hyperblock-fusion",
+      llvm::cl::desc("Enables hyperblock fusion optimization."),
+      llvm::cl::init(true)};
+
+  Option<bool> enableTaskFusion{
+      *this, "enable-task-fusion",
+      llvm::cl::desc("Enables task fusion optimization (not yet implemented)."),
+      llvm::cl::init(false)};
+
+  Option<int> maxBoundDiffForPeeling{
+      *this, "max-bound-diff",
+      llvm::cl::desc("Specifies max loop bound difference for peeling."),
+      llvm::cl::init(2)};
+};
+
+} // namespace
+
+namespace mlir {
+namespace taskflow {
+
+/// Creates an OptimizeTaskGraphPass with its default option values.
+std::unique_ptr<Pass> createOptimizeTaskGraphPass() {
+  return std::make_unique<OptimizeTaskGraphPass>();
+}
+
+} // namespace taskflow
+} // namespace mlir
diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench
index ccc0f9f..cd84bd3 160000
--- a/test/benchmark/CGRA-Bench
+++ b/test/benchmark/CGRA-Bench
@@ -1 +1 @@
-Subproject commit ccc0f9f100462a83942b8bf06247cca032fb817e
+Subproject commit cd84bd3e755a529a2a9f3631107850dac71f5063
diff --git a/test/multi-cgra/taskflow/optimization/fusion-with-outputs.mlir b/test/multi-cgra/taskflow/optimization/fusion-with-outputs.mlir
new file mode 100644
index 0000000..9f29719
--- /dev/null
+++ b/test/multi-cgra/taskflow/optimization/fusion-with-outputs.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task --optimize-task-graph \
+// RUN: | FileCheck %s
+
+// Tests that two independent top-level loops with different bodies become two separate tasks with one hyperblock each (no fusion across tasks).
+
+module {
+  func.func @test_fusion_with_outputs(%A: memref<16xf32>, %B: memref<16xf32>) {
+    // First loop: stores float(i) to A[i] for i in [0, 16).
+    affine.for %i = 0 to 16 {
+      %idx = arith.index_cast %i : index to i32
+      %val = arith.sitofp %idx : i32 to f32
+      affine.store %val, %A[%i] : memref<16xf32>
+    }
+    
+    // Second loop: stores float(i) * float(i) to B[i]; it never accesses A.
+    affine.for %i = 0 to 16 {
+      %idx = arith.index_cast %i : index to i32
+      %val = arith.sitofp %idx : i32 to f32
+      %doubled = arith.mulf %val, %val : f32
+      affine.store %doubled, %B[%i] : memref<16xf32>
+    }
+    
+    return
+  }
+}
+
+// After conversion and optimization, both loops become separate tasks.
+
+// CHECK:      module {
+// CHECK-NEXT:   func.func @test_fusion_with_outputs(%arg0: memref<16xf32>, %arg1: memref<16xf32>) {
+// CHECK-NEXT:     %memory_outputs = "taskflow.task"(%arg0) <{operandSegmentSizes = array<i32: 1, 0>, resultSegmentSizes = array<i32: 1, 0>, task_name = "Task_0"}> ({
+// CHECK-NEXT:     ^bb0(%arg2: memref<16xf32>):
+// CHECK-NEXT:       %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index
+// CHECK-NEXT:       taskflow.hyperblock indices(%0 : index) {
+// CHECK-NEXT:       ^bb0(%arg3: index):
+// CHECK-NEXT:         %1 = arith.index_cast %arg3 : index to i32
+// CHECK-NEXT:         %2 = arith.sitofp %1 : i32 to f32
+// CHECK-NEXT:         memref.store %2, %arg2[%arg3] : memref<16xf32>
+// CHECK-NEXT:       } -> ()
+// CHECK-NEXT:       "taskflow.yield"(%arg2) <{operandSegmentSizes = array<i32: 1, 0>}> : (memref<16xf32>) -> ()
+// CHECK-NEXT:     }) : (memref<16xf32>) -> memref<16xf32>
+// CHECK-NEXT:     %memory_outputs_0 = "taskflow.task"(%arg1) <{operandSegmentSizes = array<i32: 1, 0>, resultSegmentSizes = array<i32: 1, 0>, task_name = "Task_1"}> ({
+// CHECK-NEXT:     ^bb0(%arg2: memref<16xf32>):
+// CHECK-NEXT:       %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index
+// CHECK-NEXT:       taskflow.hyperblock indices(%0 : index) {
+// CHECK-NEXT:       ^bb0(%arg3: index):
+// CHECK-NEXT:         %1 = arith.index_cast %arg3 : index to i32
+// CHECK-NEXT:         %2 = arith.sitofp %1 : i32 to f32
+// CHECK-NEXT:         %3 = arith.mulf %2, %2 : f32
+// CHECK-NEXT:         memref.store %3, %arg2[%arg3] : memref<16xf32>
+// CHECK-NEXT:       } -> ()
+// CHECK-NEXT:       "taskflow.yield"(%arg2) <{operandSegmentSizes = array<i32: 1, 0>}> : (memref<16xf32>) -> ()
+// CHECK-NEXT:     }) : (memref<16xf32>) -> memref<16xf32>
+// CHECK-NEXT:     return
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
diff --git a/test/multi-cgra/taskflow/optimization/hyperblock-fusion.mlir b/test/multi-cgra/taskflow/optimization/hyperblock-fusion.mlir
new file mode 100644
index 0000000..4839b93
--- /dev/null
+++ b/test/multi-cgra/taskflow/optimization/hyperblock-fusion.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task --optimize-task-graph \
+// RUN: | FileCheck %s
+
+// Tests that hyperblock fusion is not applied across task boundaries, even when the hyperblocks have identical counter structures.
+// Two independent top-level loops with the same bounds become separate tasks, each with its own hyperblock (no cross-task fusion).
+
+module {
+  func.func @test_hyperblock_fusion(%A: memref<16xf32>, %B: memref<16xf32>, %scale: f32) {
+    // First top-level loop: scales A in place (A[i] = A[i] * %scale); it accesses only A.
+    affine.for %i = 0 to 16 {
+      %v = affine.load %A[%i] : memref<16xf32>
+      %scaled = arith.mulf %v, %scale : f32
+      affine.store %scaled, %A[%i] : memref<16xf32>
+    }
+    
+    // Second top-level loop: scales B in place (B[i] = B[i] * %scale); it accesses only B, never A.
+    affine.for %i = 0 to 16 {
+      %v = affine.load %B[%i] : memref<16xf32>
+      %scaled = arith.mulf %v, %scale : f32
+      affine.store %scaled, %B[%i] : memref<16xf32>
+    }
+    
+    return
+  }
+}
+
+// After conversion and optimization, both top-level loops become separate tasks.
+// Cross-task fusion is not performed; each task has one hyperblock.
+
+// CHECK:      module {
+// CHECK-NEXT:   func.func @test_hyperblock_fusion(%arg0: memref<16xf32>, %arg1: memref<16xf32>, %arg2: f32) {
+// CHECK-NEXT:     %memory_outputs = "taskflow.task"(%arg0, %arg2) <{operandSegmentSizes = array<i32: 1, 1>, resultSegmentSizes = array<i32: 1, 0>, task_name = "Task_0"}> ({
+// CHECK-NEXT:     ^bb0(%arg3: memref<16xf32>, %arg4: f32):
+// CHECK-NEXT:       %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index
+// CHECK-NEXT:       taskflow.hyperblock indices(%0 : index) {
+// CHECK-NEXT:       ^bb0(%arg5: index):
+// CHECK-NEXT:         %1 = memref.load %arg3[%arg5] : memref<16xf32>
+// CHECK-NEXT:         %2 = arith.mulf %1, %arg4 : f32
+// CHECK-NEXT:         memref.store %2, %arg3[%arg5] : memref<16xf32>
+// CHECK-NEXT:       } -> ()
+// CHECK-NEXT:       "taskflow.yield"(%arg3) <{operandSegmentSizes = array<i32: 1, 0>}> : (memref<16xf32>) -> ()
+// CHECK-NEXT:     }) : (memref<16xf32>, f32) -> memref<16xf32>
+// CHECK-NEXT:     %memory_outputs_0 = "taskflow.task"(%arg1, %arg2) <{operandSegmentSizes = array<i32: 1, 1>, resultSegmentSizes = array<i32: 1, 0>, task_name = "Task_1"}> ({
+// CHECK-NEXT:     ^bb0(%arg3: memref<16xf32>, %arg4: f32):
+// CHECK-NEXT:       %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 16 : index} : index
+// CHECK-NEXT:       taskflow.hyperblock indices(%0 : index) {
+// CHECK-NEXT:       ^bb0(%arg5: index):
+// CHECK-NEXT:         %1 = memref.load %arg3[%arg5] : memref<16xf32>
+// CHECK-NEXT:         %2 = arith.mulf %1, %arg4 : f32
+// CHECK-NEXT:         memref.store %2, %arg3[%arg5] : memref<16xf32>
+// CHECK-NEXT:       } -> ()
+// CHECK-NEXT:       "taskflow.yield"(%arg3) <{operandSegmentSizes = array<i32: 1, 0>}> : (memref<16xf32>) -> ()
+// CHECK-NEXT:     }) : (memref<16xf32>, f32) -> memref<16xf32>
+// CHECK-NEXT:     return
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
diff --git a/test/multi-cgra/taskflow/optimization/nested-fusion.mlir b/test/multi-cgra/taskflow/optimization/nested-fusion.mlir
new file mode 100644
index 0000000..1000396
--- /dev/null
+++ b/test/multi-cgra/taskflow/optimization/nested-fusion.mlir
@@ -0,0 +1,49 @@
+// RUN: mlir-neura-opt %s --convert-affine-to-taskflow \
+// RUN: --construct-hyperblock-from-task --optimize-task-graph \
+// RUN: | FileCheck %s
+
+// Tests hyperblock fusion for sibling inner loops with identical counter structures.
+// Two independent inner loops nested under the same outer loop (and hence in the same task) should be fused into one hyperblock.
+
+module {
+  func.func @test_nested_fusion(%A: memref<8x8xf32>, %B: memref<8x8xf32>,
+                                 %C: memref<8x8xf32>, %D: memref<8x8xf32>) {
+    // The outer loop becomes a single task (Task_0 in the CHECK lines) that contains both inner loops.
+    affine.for %i = 0 to 8 {
+      // First inner loop: copies row %i of A into C (reads A, writes C).
+      affine.for %j = 0 to 8 {
+        %v = affine.load %A[%i, %j] : memref<8x8xf32>
+        affine.store %v, %C[%i, %j] : memref<8x8xf32>
+      }
+      // Second inner loop: copies row %i of B into D; it accesses neither A nor C.
+      // Expected to be fused with the first inner loop: both iterate %j from 0 to 8 under the same %i.
+      affine.for %j = 0 to 8 {
+        %v = affine.load %B[%i, %j] : memref<8x8xf32>
+        affine.store %v, %D[%i, %j] : memref<8x8xf32>
+      }
+    }
+    return
+  }
+}
+
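+// After optimization, both inner loops should be fused into ONE hyperblock. Note that the counter of the second inner loop (%2) is still emitted, although the fused hyperblock only uses %0 and %1.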
+
+// CHECK:      module {
+// CHECK-NEXT:   func.func @test_nested_fusion(%arg0: memref<8x8xf32>, %arg1: memref<8x8xf32>, %arg2: memref<8x8xf32>, %arg3: memref<8x8xf32>) {
+// CHECK-NEXT:     %memory_outputs:2 = "taskflow.task"(%arg0, %arg1, %arg2, %arg3) <{operandSegmentSizes = array<i32: 4, 0>, resultSegmentSizes = array<i32: 2, 0>, task_name = "Task_0"}> ({
+// CHECK-NEXT:     ^bb0(%arg4: memref<8x8xf32>, %arg5: memref<8x8xf32>, %arg6: memref<8x8xf32>, %arg7: memref<8x8xf32>):
+// CHECK-NEXT:       %0 = taskflow.counter attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index
+// CHECK-NEXT:       %1 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index
+// CHECK-NEXT:       %2 = taskflow.counter parent(%0 : index) attributes {lower_bound = 0 : index, step = 1 : index, upper_bound = 8 : index} : index
+// CHECK-NEXT:       taskflow.hyperblock indices(%0, %1 : index, index) {
+// CHECK-NEXT:       ^bb0(%arg8: index, %arg9: index):
+// CHECK-NEXT:         %3 = memref.load %arg4[%arg8, %arg9] : memref<8x8xf32>
+// CHECK-NEXT:         memref.store %3, %arg6[%arg8, %arg9] : memref<8x8xf32>
+// CHECK-NEXT:         %4 = memref.load %arg5[%arg8, %arg9] : memref<8x8xf32>
+// CHECK-NEXT:         memref.store %4, %arg7[%arg8, %arg9] : memref<8x8xf32>
+// CHECK-NEXT:       } -> ()
+// CHECK-NEXT:       "taskflow.yield"(%arg6, %arg7) <{operandSegmentSizes = array<i32: 2, 0>}> : (memref<8x8xf32>, memref<8x8xf32>) -> ()
+// CHECK-NEXT:     }) : (memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>, memref<8x8xf32>) -> (memref<8x8xf32>, memref<8x8xf32>)
+// CHECK-NEXT:     return
+// CHECK-NEXT:   }
+// CHECK-NEXT: }