From 1c1933f16f6f372c2f3eb9a88ba498e858f7456f Mon Sep 17 00:00:00 2001 From: Victor Li Date: Tue, 11 Mar 2025 20:42:03 -0700 Subject: [PATCH 01/11] MCMC algorithm draft --- .../mcmc/machine_mapping_mutation_set.h | 32 ++ .../include/compiler/mcmc/mcmc_algorithm.h | 22 ++ .../compiler/mcmc/mcmc_graph_optimize_state.h | 35 ++ .../unity_algorithm/graph_optimize_state.h | 4 +- .../mcmc/machine_mapping_mutation_set.cc | 110 ++++++ .../src/compiler/mcmc/mcmc_algorithm.cc | 320 ++++++++++++++++++ .../mcmc/mcmc_graph_optimize_state.cc | 84 +++++ .../test/src/compiler/mcmc/mcmc_algorithm.cc | 88 +++++ .../apply_substitution/apply_substitution.cc | 2 + .../operator_pattern/satisfies_constraint.cc | 5 + .../src/substitutions/pcg_pattern.cc | 13 + .../sub_parallel_computation_graph.cc | 55 ++- .../unlabelled/find_pattern_matches.cc | 8 +- .../unlabelled/pattern_matching.cc | 10 + 14 files changed, 757 insertions(+), 31 deletions(-) create mode 100644 lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h create mode 100644 lib/compiler/include/compiler/mcmc/mcmc_algorithm.h create mode 100644 lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h create mode 100644 lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc create mode 100644 lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc create mode 100644 lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc create mode 100644 lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc diff --git a/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h new file mode 100644 index 0000000000..e41aad2f71 --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H + +#include 
"compiler/machine_mapping/machine_mapping.h" +#include "compiler/search_result.dtg.h" + +namespace FlexFlow { +std::vector + get_possible_mutations(SearchResult mapped_pcg, + MachineSpecification const &resource); +MachineMapping permute_layers(std::vector layers, + MachineMapping mapping); +MachineMapping copy_layer(parallel_layer_guid_t source, + parallel_layer_guid_t destination, + MachineMapping mapping); +MachineView change_stride(nonnegative_int stride, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim); +MachineView change_node_idx(nonnegative_int node_ix, + parallel_layer_guid_t layer, + MachineView machine_view); +MachineView change_device_idx(nonnegative_int device_idx, + parallel_layer_guid_t layer, + MachineView machine_view); +MachineView change_projection(MachineSpecificationDimension projection, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim); +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h new file mode 100644 index 0000000000..53efa845cf --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H +#define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include "compiler/unity_algorithm/unity_search_config.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution.h" + +namespace FlexFlow { + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h b/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h new file mode 100644 index 0000000000..3306af123a --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h @@ -0,0 +1,35 @@ +#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H +#define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H + +#include "compiler/search_result.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" + +namespace FlexFlow { + +struct MCMCOptimizeState { + MCMCOptimizeState() = delete; + explicit MCMCOptimizeState(SearchResult const &mapped_pcg, float runtime); + + SearchResult mapped_pcg; + float runtime; + + bool operator==(MCMCOptimizeState const &other) const; + bool operator!=(MCMCOptimizeState const &other) const; + bool operator<(MCMCOptimizeState const &other) const; +}; + +std::string format_as(MCMCOptimizeState const &); +std::ostream &operator<<(std::ostream &, MCMCOptimizeState const &); + +} // namespace FlexFlow + +namespace std { + +template <> +struct hash<::FlexFlow::MCMCOptimizeState> { + size_t operator()(::FlexFlow::MCMCOptimizeState const &) const; +}; + +} // namespace std + +#endif diff --git a/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h index 5f06fd242c..9f609f3118 100644 --- a/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h +++ b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_STATE_H -#define _FLEXFLOW_COMPILER_MCMC_STATE_H +#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H +#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H #include "pcg/parallel_computation_graph/parallel_computation_graph.h" diff --git a/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc new file mode 100644 index 
0000000000..d67e4cb592 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc @@ -0,0 +1,110 @@ +#include "compiler/mcmc/machine_mapping_mutation_set.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.h" +#include "utils/containers/vector_of.h" +#include "utils/vector.h" + +namespace FlexFlow { + +bool mutation_is_allowed(ParallelComputationGraph &pcg, + parallel_layer_guid_t layer, + MachineSpecification const &resources, + MachineView machine_view) { + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::unordered_set allowed_machine_views = + get_allowed_machine_views(resources, task, DeviceType::GPU); + return allowed_machine_views.count(machine_view); +} + +std::vector + get_possible_mutations(SearchResult mapped_pcg, + MachineSpecification const &resources) { + //each mutation only changes one layer at a time + ParallelComputationGraph pcg = mapped_pcg.pcg; + std::vector layers = topological_ordering(pcg); + std::vector machine_mappings; + for (parallel_layer_guid_t layer : layers) { + MachineMapping original_mapping = mapped_pcg.machine_mapping; + MachineView machine_view = original_mapping.machine_views.at(layer); + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::vector allowed_machine_views = + vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); + + std::vector new_machine_mappings = + transform(allowed_machine_views, [&](MachineView machine_views) { + MachineMapping original_mapping = mapped_pcg.machine_mapping; + original_mapping.machine_views.at(layer) = machine_views; + return original_mapping; + }); + machine_mappings = concat(machine_mappings, new_machine_mappings); + } + return machine_mappings; +} + +MachineMapping permute_layers(std::vector layers, + MachineMapping mapping) { + NOT_IMPLEMENTED(); +} + +MachineMapping copy_layer(parallel_layer_guid_t source, + parallel_layer_guid_t 
destination, + MachineMapping mapping) { + std::unordered_map machine_views = + mapping.machine_views; + MachineView machine_view_to_copy = machine_views.at(source); + machine_views.try_emplace(destination, machine_view_to_copy); + return MachineMapping{machine_views}; +} + +MachineView change_stride(nonnegative_int stride, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim) { + std::vector strides = get_strides(machine_view); + strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, strides, get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_node_idx(nonnegative_int node_ix, + parallel_layer_guid_t layer, + MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + MachineSpaceCoordinate{node_ix, + machine_view.start.device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_device_idx(nonnegative_int device_idx, + parallel_layer_guid_t layer, + MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + MachineSpaceCoordinate{machine_view.start.node_idx, + device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_projection(MachineSpecificationDimension projection, + parallel_layer_guid_t layer, + MachineView machine_view, + nonnegative_int dim) { + std::vector dims = + get_dimensions(machine_view); + dims.at(dim.unwrap_nonnegative()) = projection; + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, get_strides(machine_view), dims); + return new_machine_view; +} +} // namespace FlexFlow diff --git 
a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc new file mode 100644 index 0000000000..61f425fec6 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -0,0 +1,320 @@ +#include "compiler/mcmc/mcmc_algorithm.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "compiler/mcmc/machine_mapping_mutation_set.h" +#include "compiler/mcmc/mcmc_graph_optimize_state.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "pcg/operator_task_space.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/apply_substitution.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/sub_parallel_computation_graph_data.dtg.h" +#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "substitutions/substitution.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/transform.h" +#include "utils/containers/values.h" +#include "utils/deduplicated_priority_queue.h" +#include "utils/full_binary_tree/binary_tree_path.h" +#include "utils/graph/node/algorithms.h" +#include "utils/optional.h" + +namespace FlexFlow { + +std::optional + get_naive_mapping(ParallelComputationGraph &pcg, + MachineSpecification const &resources) { + std::vector layers = 
topological_ordering(pcg); + std::unordered_map machine_views; + for (parallel_layer_guid_t layer : layers) { + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::unordered_set allowed_machine_views = + get_allowed_machine_views(resources, task, DeviceType::GPU); + if (allowed_machine_views.empty()) { + return std::nullopt; + } + machine_views.insert({layer, *(allowed_machine_views.begin())}); + } + return MachineMapping{machine_views}; +} + +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match) { + // std::cout << "applying substitution" << std::endl; + SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); + + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map machine_views = + mapped_pcg.machine_mapping.machine_views; + + std::unordered_set substituted_machine_views = + transform(matched_nodes, [&](parallel_layer_guid_t const &node) { + return machine_views.at(node); + }); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + // just taking the first substituted machine 
view, not sure if this + // is fine + for (auto [layer, attrs] : post_node_data_from_sub) { + machine_views.try_emplace(layer, *substituted_machine_views.begin()); + } + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_use(base_graph_tensor, use); + incoming_to_sub_edges.insert(new_edge); + } + } + + std::unordered_set outgoing_from_sub_edges; + for (ParallelComputationGraphEdge const &outgoing_edge : + 
get_subgraph_outgoing_edges(spcg, matched_nodes)) { + parallel_tensor_guid_t original_tensor = + get_parallel_tensor(outgoing_edge); + PatternNodeOutput pattern_tensor = + output_orig_pattern_mapping.at_r(original_tensor); + OutputGraphExprNodeOutput output_graph_tensor = + sub.outputs_mapping.at_l(pattern_tensor); + parallel_tensor_guid_t new_tensor = + output_post_outexpr_mapping.at_r(output_graph_tensor); + + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_dst( + new_tensor, + get_dst_layer(outgoing_edge), + get_dst_layer_input_idx(outgoing_edge)); + outgoing_from_sub_edges.insert(new_edge); + } + + return set_union(std::vector{ + post_edges_from_orig, + post_edges_from_sub, + incoming_to_sub_edges, + outgoing_from_sub_edges, + }); + }(); + + std::unordered_set post_inputs = + pre_data.inputs; + + std::unordered_map + post_value_data = [&] { + std::unordered_map + post_value_data_from_orig = filter_keys( + pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { + return visit_open_parallel_tensor_guid( + t, + overload{ + [&](parallel_tensor_guid_t const &t) { + return contains(post_nodes_from_original_graph, + get_source_layer(t)); + }, + [](input_parallel_tensor_guid_t const &) { + return true; + }, + }); + }); + + std::unordered_map + post_value_data_from_sub = output_graph_data.value_data; + return merge_disjoint_maps(post_value_data_from_orig, + post_value_data_from_sub); + }(); + + SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ + post_node_data, + post_edges, + post_inputs, + post_value_data, + }; + + return SearchResult{ + pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), + MachineMapping{machine_views}}; +} + +std::vector all_pcgs_obtained_by_applying_a_substitution( + SearchResult const &mapped_pcg, + std::vector const &substitutions) { + std::vector results; + SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); + // std::cout << "len" << 
substitutions.size() << std::endl; + for (Substitution const &substitution : substitutions) { + std::cout << "in outer loop" << std::endl; + for (PCGPatternMatch const &pattern_match : + find_pattern_matches(substitution.pcg_pattern, subpcg)) { + std::cout << "getting stuff" << std::endl; + SearchResult mapped_pcg_from_substitution = + apply_substitution_and_update_machine_mapping( + mapped_pcg, substitution, pattern_match); + results.push_back(mapped_pcg_from_substitution); + } + } + return results; +} + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + DeduplicatedPriorityQueue candidates; + + std::optional naive_mapping = + get_naive_mapping(pcg, resources); + if (naive_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + // multiply runtime by -1 to make it minheap instead of maxheap + MCMCOptimizeState best_state = MCMCOptimizeState{ + SearchResult{pcg, naive_mapping.value()}, + -1 * task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, naive_mapping.value(), resources)}; + + candidates.push(best_state); + + for (int iteration = 0; + !candidates.empty() && iteration < search_config.budget; + ++iteration) { + MCMCOptimizeState current_state = candidates.top(); + candidates.pop(); + + SearchResult current_mapped_pcg = current_state.mapped_pcg; + float current_estimate = current_state.runtime * -1; + float best_estimate = best_state.runtime * -1; + + if (current_estimate < best_estimate) { + best_state = current_state; + std::cout << "new best state" << std::endl; + std::cout << current_estimate << std::endl; + std::cout << best_estimate << std::endl; + } else if (current_estimate > best_estimate * search_config.alpha) { + continue; + } else { + std::cout << current_estimate << best_estimate * 
search_config.alpha + << std::endl; + } + // std::cout << "Hello" << std::endl; + + for (SearchResult const &new_mapped_pcg : + all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, + substitutions)) { + float new_estimate = task_simulator_estimate_forward_pass_time( + new_mapped_pcg.pcg, + cost_estimator, + new_mapped_pcg.machine_mapping, + resources); + + std::cout << "new substitution" << std::endl; + + std::cout << "new estimate" << new_estimate << std::endl; + if (new_estimate <= search_config.threshold && + get_nodes(new_mapped_pcg.pcg.raw_graph).size() <= + search_config.max_num_ops) { + candidates.push(MCMCOptimizeState{new_mapped_pcg, -1 * new_estimate}); + } + } + + for (MachineMapping const &new_machine_mapping : + get_possible_mutations(current_mapped_pcg, resources)) { + float new_estimate = + task_simulator_estimate_forward_pass_time(current_mapped_pcg.pcg, + cost_estimator, + new_machine_mapping, + resources); + //std::cout << "new mapping" << std::endl; + + //std::cout << "new estimate" << new_estimate << std::endl; + if (new_estimate <= search_config.threshold) { + //std::cout << "pushed" << std::endl; + candidates.push( + MCMCOptimizeState{SearchResult{current_mapped_pcg.pcg, new_machine_mapping}, -1 * new_estimate}); + } + } + } + return best_state.mapped_pcg; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc b/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc new file mode 100644 index 0000000000..2556a50b4d --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc @@ -0,0 +1,84 @@ +#include "compiler/mcmc/mcmc_graph_optimize_state.h" +#include "pcg/machine_view.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" + +namespace FlexFlow { + +MCMCOptimizeState::MCMCOptimizeState(SearchResult const &mapped_pcg, + float runtime) + : mapped_pcg(mapped_pcg), runtime(runtime) {} + +bool MCMCOptimizeState::operator==(MCMCOptimizeState const 
&other) const { + return pcgs_are_isomorphic(mapped_pcg.pcg, other.mapped_pcg.pcg) && + mapped_pcg.machine_mapping == other.mapped_pcg.machine_mapping && + runtime == other.runtime; +} + +bool MCMCOptimizeState::operator!=(MCMCOptimizeState const &other) const { + return !(*this == other); +} + +bool MCMCOptimizeState::operator<(MCMCOptimizeState const &other) const { + return runtime < other.runtime; +} + +std::string format_as(MCMCOptimizeState const &r) { + return fmt::format("", + as_dot(r.mapped_pcg.pcg), + r.mapped_pcg.machine_mapping, + r.runtime); +} + +std::ostream &operator<<(std::ostream &s, MCMCOptimizeState const &st) { + return (s << fmt::to_string(st)); +} +} // namespace FlexFlow + +namespace std { + +size_t hash<::FlexFlow::MCMCOptimizeState>::operator()( + ::FlexFlow::MCMCOptimizeState const &state) const { + ::FlexFlow::ParallelComputationGraph pcg = state.mapped_pcg.pcg; + ::FlexFlow::MachineMapping machine_mapping = state.mapped_pcg.machine_mapping; + size_t seed = 0; + ::FlexFlow::hash_combine(seed, state.runtime); + std::vector<::FlexFlow::parallel_layer_guid_t> layers = + topological_ordering(pcg); + ::FlexFlow::hash_combine(seed, layers.size()); + for (::FlexFlow::parallel_layer_guid_t const &layer : layers) { + ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(pcg, layer)); + std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = + get_incoming_tensors(pcg, layer); + ::FlexFlow::hash_combine(seed, inputs.size()); + for (::FlexFlow::parallel_tensor_guid_t input : inputs) { + for (size_t i = 0; i < layers.size(); ++i) { + if (get_source_layer(input) == layers.at(i)) { + ::FlexFlow::hash_combine(seed, i); + break; + } + } + } + ::FlexFlow::MachineView machine_view = + machine_mapping.machine_views.at(layer); + ::FlexFlow::hash_combine(seed, machine_view.start.node_idx); + ::FlexFlow::hash_combine(seed, machine_view.start.device_idx); + if (get_device_type(machine_view) == ::FlexFlow::DeviceType::CPU) { + ::FlexFlow::hash_combine(seed, 
0); + } else { + ::FlexFlow::hash_combine(seed, 1); + } + for (::FlexFlow::MachineViewDimension dimension : machine_view.dimensions) { + ::FlexFlow::hash_combine(seed, dimension.stride.unwrapped); + if (dimension.projection == + ::FlexFlow::MachineSpecificationDimension::INTRA_NODE) { + ::FlexFlow::hash_combine(seed, 0); + } else { + ::FlexFlow::hash_combine(seed, 1); + } + } + } + + return seed; +} + +} // namespace std diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc new file mode 100644 index 0000000000..d441db199f --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -0,0 +1,88 @@ +#include "compiler/mcmc/mcmc_algorithm.h" +#include "../cost_estimator_for_test.h" +#include "doctest/doctest.h" +#include "op-attrs/parallel_tensor_dims.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/replica_type.dtg.h" +#include "op-attrs/shard_parallel_dim.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/integer_conversions.h" +#include "compiler/task_graph_simulator/task_simulator.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("mcmc_graph_optimize") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{nonnegative_int{32}, + nonnegative_int{64}}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/nonnegative_int{16}, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{12}, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + 
/*outDim=*/nonnegative_int{8}, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &k) { + return OpCostMetrics{ + /*forward_runtime=*/1.0, + /*backward_runtime=*/2.0, + /*memory=*/nonnegative_int{1}, + }; + }, + [](TensorSetMovement const &) { return 1.0; }); + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{1}, + /*num_gpus_per_node=*/nonnegative_int{1}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + UnitySearchConfig search_config = UnitySearchConfig{ + /*alpha=*/1.2, + /*budget=*/10, + /*threshold=*/30.0, + /*max_num_ops=*/100, + }; + + SearchResult result = mcmc_graph_optimize( + pcg, cost_estimator, full_machine_spec, search_config); + + std::cout << task_simulator_estimate_forward_pass_time(result.pcg, + cost_estimator, + result.machine_mapping, + full_machine_spec) << std::endl; + + CHECK(task_simulator_estimate_forward_pass_time(result.pcg, + cost_estimator, + result.machine_mapping, + full_machine_spec) < 16); + + CHECK(false); + } +} diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc index 61bfe15d7b..f1354264f8 100644 --- a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc @@ -159,6 +159,8 @@ SubParallelComputationGraph post_value_data, }; + std::cout << as_dot(sub_pcg_from_graph_data(post_data)) << std::endl; + return sub_pcg_from_graph_data(post_data); } diff --git a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc index 
194ae49255..40c69bf4c8 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc @@ -13,9 +13,14 @@ bool operator_satisfies_constraint( return false; } + // std::cout << constraint.constraint_type << std::endl; switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val.value() == constraint.attribute_value; + case ConstraintType::DIVISIBLE_BY: + return (expr_val.value().get() % + constraint.attribute_value.get()) == + 0; default: throw mk_runtime_error( fmt::format("Unknown constraint type {}", diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index a0af875848..3277789a57 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -23,6 +23,8 @@ std::unordered_set get_nodes(PCGPattern const &p) { static MatchAdditionalCriterion pcg_pattern_criteria(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { + // std::cout << "GGETTING pattern criteria" << std::endl; + // std::cout << get_nodes(pattern) << std::endl; return MatchAdditionalCriterion{ [&](PatternNode const &patternNode, Node const &pcgNode) { return operator_satisfies_pattern( @@ -40,6 +42,8 @@ static MatchAdditionalCriterion std::vector find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { + + // std::cout << "IN PATTERN MATCH"<< std::endl; std::vector unlabelled_matches = find_pattern_matches(get_unlabelled_pattern(pattern), pcg.raw_graph, @@ -65,11 +69,20 @@ UnlabelledGraphPattern get_unlabelled_pattern(PCGPattern const &p) { TensorAttributePattern get_tensor_pattern(PCGPattern const &p, PatternValue const &v) { + + // std::cout << "get tensor pattern"<< std::endl; + // std::cout << v << std::endl; + // std::cout << raw_open_dataflow_value_from_pattern_value(v) << std::endl; + 
TensorAttributePattern t = + p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); + // std::cout << "hmm" << std::endl; return p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); } OperatorAttributePattern get_operator_pattern(PCGPattern const &p, PatternNode const &n) { + + // std::cout << "get op pattern"<< std::endl; return p.raw_graph.at(n.raw_node); } diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc index 83df74f21b..0c673f0a8a 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc @@ -188,34 +188,33 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs, } std::string as_dot(SubParallelComputationGraph const &spcg) { - NOT_IMPLEMENTED(); - // std::function get_node_label = - // [](ParallelLayerAttrs const &a) -> std::string { - // RecordFormatter r = as_dot(a.op_attrs); - // - // if (a.name.has_value()) { - // RecordFormatter rr; - // rr << "Name" << a.name.value(); - // r << rr; - // } - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // std::function get_input_label = - // [](ParallelTensorAttrs const &a) -> std::string { - // RecordFormatter r; - // - // r << fmt::to_string(a.shape); - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // return as_dot(spcg.raw_graph, get_node_label, get_input_label); + std::function get_node_label = + [](ParallelLayerAttrs const &a) -> std::string { + RecordFormatter r = as_dot(a.op_attrs); + + if (a.name.has_value()) { + RecordFormatter rr; + rr << "Name" << a.name.value(); + r << rr; + } + + std::ostringstream oss; + oss << r; + return oss.str(); + }; + + std::function get_input_label = + [](ParallelTensorAttrs const &a) -> std::string { + RecordFormatter r; + + r << fmt::to_string(a.shape); + + 
std::ostringstream oss; + oss << r; + return oss.str(); + }; + + return as_dot(spcg.raw_graph, get_node_label, get_input_label); } void debug_print_dot(SubParallelComputationGraph const &spcg) { diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index a7ebc0bff7..1142333f48 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -71,16 +71,22 @@ std::vector find_pattern_matches(UnlabelledGraphPattern const &pattern, OpenDataflowGraphView const &graph, MatchAdditionalCriterion const &additional_criterion) { + // std::cout << "find pattern matches" << std::endl; std::vector matches; if (is_singleton_pattern(pattern)) { + // std::cout << "singleton pattern" << std::endl; for (Node const &graph_node : get_nodes(graph)) { + // std::cout << "11111" << std::endl; std::optional candidate = get_candidate_singleton_match(pattern, graph, graph_node); + // std::cout << "22222" << std::endl; if (candidate.has_value() && unlabelled_pattern_does_match( pattern, graph, candidate.value(), additional_criterion)) { + // std::cout << "2.555" << std::endl; matches.push_back(candidate.value()); } + // std::cout << "33333" << std::endl; } } else { PatternSplit split = find_even_split(pattern); @@ -110,7 +116,7 @@ std::vector } } } - + // std::cout << "return from pattern matches" << std::endl; return matches; } diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index 304bb8cf46..e4285e37bf 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,24 +97,30 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const 
&match, MatchAdditionalCriterion const &additional_criterion) { + // std::cout << "pattern amtches subgrpah under" << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; std::unordered_set concrete_nodes = get_nodes(subgraph); std::unordered_set concrete_nodes_from_match = transform(get_nodes(pattern), concrete_from_pattern); + // std::cout << "mid of pattern amtches subgrpah under" << std::endl; if (concrete_nodes != concrete_nodes_from_match) { return false; } for (PatternNode const &pattern_node : get_nodes(pattern)) { + // std::cout << "hello hello hello" << std::endl; if (!additional_criterion.node_criterion( pattern_node, concrete_from_pattern(pattern_node))) { + // std::cout << "hello hello hello hello hello" << std::endl; return false; } } + // std::cout << "later mid of pattern amtches subgrpah under" << std::endl; + std::unordered_set concrete_edges = get_edges(subgraph); std::unordered_set concrete_edge_from_match = transform(get_edges(pattern), concrete_from_pattern); @@ -138,6 +144,7 @@ bool pattern_matches_subgraph_under( return false; } } + // std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -147,12 +154,14 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { + // std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); + // std::cout << "middle of" << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -171,6 +180,7 @@ bool unlabelled_pattern_does_match( }}); }, }; + // std::cout << "end of 
unlabelled_pattern_does_match" << std::endl; return pattern_matches_subgraph_under( pattern, From fe6d9505f8ea11d6ed8d421663b0f1eb1364db7b Mon Sep 17 00:00:00 2001 From: Victor Li Date: Wed, 12 Mar 2025 03:02:45 -0700 Subject: [PATCH 02/11] removing substitution part of MCMC for now --- .envrc | 3 ++ .vimrc | 8 +++++ .../src/compiler/mcmc/mcmc_algorithm.cc | 25 +++----------- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 6 ---- .../src/substitutions/pcg_pattern.cc | 13 -------- .../unlabelled/find_pattern_matches.cc | 7 ---- .../unlabelled/pattern_matching.cc | 33 +++++++++++++------ 7 files changed, 39 insertions(+), 56 deletions(-) create mode 100644 .envrc create mode 100644 .vimrc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.vimrc b/.vimrc new file mode 100644 index 0000000000..4c8a8a8279 --- /dev/null +++ b/.vimrc @@ -0,0 +1,8 @@ +" example search path configuration +set path=lib/runtime/**,lib/** + +" set build target +" let g:target = "pcg" + +" set test target +" let g:test_target = "utils-test" diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc index 61f425fec6..025fb586c6 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -28,6 +28,8 @@ #include "utils/full_binary_tree/binary_tree_path.h" #include "utils/graph/node/algorithms.h" #include "utils/optional.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" + namespace FlexFlow { @@ -52,7 +54,6 @@ SearchResult apply_substitution_and_update_machine_mapping( SearchResult const &mapped_pcg, Substitution const &sub, PCGPatternMatch const &match) { - // std::cout << "applying substitution" << std::endl; SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); auto substitution_output_result = @@ -217,19 +218,17 @@ 
std::vector all_pcgs_obtained_by_applying_a_substitution( SearchResult const &mapped_pcg, std::vector const &substitutions) { std::vector results; - SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); - // std::cout << "len" << substitutions.size() << std::endl; + //currently not functional + /*SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); for (Substitution const &substitution : substitutions) { - std::cout << "in outer loop" << std::endl; for (PCGPatternMatch const &pattern_match : find_pattern_matches(substitution.pcg_pattern, subpcg)) { - std::cout << "getting stuff" << std::endl; SearchResult mapped_pcg_from_substitution = apply_substitution_and_update_machine_mapping( mapped_pcg, substitution, pattern_match); results.push_back(mapped_pcg_from_substitution); } - } + }*/ return results; } @@ -267,16 +266,9 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, if (current_estimate < best_estimate) { best_state = current_state; - std::cout << "new best state" << std::endl; - std::cout << current_estimate << std::endl; - std::cout << best_estimate << std::endl; } else if (current_estimate > best_estimate * search_config.alpha) { continue; - } else { - std::cout << current_estimate << best_estimate * search_config.alpha - << std::endl; } - // std::cout << "Hello" << std::endl; for (SearchResult const &new_mapped_pcg : all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, @@ -287,9 +279,6 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, new_mapped_pcg.machine_mapping, resources); - std::cout << "new substitution" << std::endl; - - std::cout << "new estimate" << new_estimate << std::endl; if (new_estimate <= search_config.threshold && get_nodes(new_mapped_pcg.pcg.raw_graph).size() <= search_config.max_num_ops) { @@ -304,11 +293,7 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, cost_estimator, new_machine_mapping, resources); - //std::cout << "new mapping" 
<< std::endl; - - //std::cout << "new estimate" << new_estimate << std::endl; if (new_estimate <= search_config.threshold) { - //std::cout << "pushed" << std::endl; candidates.push( MCMCOptimizeState{SearchResult{current_mapped_pcg.pcg, new_machine_mapping}, -1 * new_estimate}); } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index d441db199f..a7ffa8e5e0 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -73,16 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); - std::cout << task_simulator_estimate_forward_pass_time(result.pcg, - cost_estimator, - result.machine_mapping, - full_machine_spec) << std::endl; - CHECK(task_simulator_estimate_forward_pass_time(result.pcg, cost_estimator, result.machine_mapping, full_machine_spec) < 16); - CHECK(false); } } diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index 3277789a57..a0af875848 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -23,8 +23,6 @@ std::unordered_set get_nodes(PCGPattern const &p) { static MatchAdditionalCriterion pcg_pattern_criteria(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { - // std::cout << "GGETTING pattern criteria" << std::endl; - // std::cout << get_nodes(pattern) << std::endl; return MatchAdditionalCriterion{ [&](PatternNode const &patternNode, Node const &pcgNode) { return operator_satisfies_pattern( @@ -42,8 +40,6 @@ static MatchAdditionalCriterion std::vector find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { - - // std::cout << "IN PATTERN MATCH"<< std::endl; std::vector unlabelled_matches = find_pattern_matches(get_unlabelled_pattern(pattern), 
pcg.raw_graph, @@ -69,20 +65,11 @@ UnlabelledGraphPattern get_unlabelled_pattern(PCGPattern const &p) { TensorAttributePattern get_tensor_pattern(PCGPattern const &p, PatternValue const &v) { - - // std::cout << "get tensor pattern"<< std::endl; - // std::cout << v << std::endl; - // std::cout << raw_open_dataflow_value_from_pattern_value(v) << std::endl; - TensorAttributePattern t = - p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); - // std::cout << "hmm" << std::endl; return p.raw_graph.at(raw_open_dataflow_value_from_pattern_value(v)); } OperatorAttributePattern get_operator_pattern(PCGPattern const &p, PatternNode const &n) { - - // std::cout << "get op pattern"<< std::endl; return p.raw_graph.at(n.raw_node); } diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index 1142333f48..0f26ce93fa 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -71,22 +71,16 @@ std::vector find_pattern_matches(UnlabelledGraphPattern const &pattern, OpenDataflowGraphView const &graph, MatchAdditionalCriterion const &additional_criterion) { - // std::cout << "find pattern matches" << std::endl; std::vector matches; if (is_singleton_pattern(pattern)) { - // std::cout << "singleton pattern" << std::endl; for (Node const &graph_node : get_nodes(graph)) { - // std::cout << "11111" << std::endl; std::optional candidate = get_candidate_singleton_match(pattern, graph, graph_node); - // std::cout << "22222" << std::endl; if (candidate.has_value() && unlabelled_pattern_does_match( pattern, graph, candidate.value(), additional_criterion)) { - // std::cout << "2.555" << std::endl; matches.push_back(candidate.value()); } - // std::cout << "33333" << std::endl; } } else { PatternSplit split = find_even_split(pattern); @@ -116,7 +110,6 @@ std::vector } } } - // 
std::cout << "return from pattern matches" << std::endl; return matches; } diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index e4285e37bf..cfb34aac3a 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,30 +97,26 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - // std::cout << "pattern amtches subgrpah under" << std::endl; + std::cout << "gamer" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; std::unordered_set concrete_nodes = get_nodes(subgraph); std::unordered_set concrete_nodes_from_match = transform(get_nodes(pattern), concrete_from_pattern); - // std::cout << "mid of pattern amtches subgrpah under" << std::endl; if (concrete_nodes != concrete_nodes_from_match) { return false; } for (PatternNode const &pattern_node : get_nodes(pattern)) { - // std::cout << "hello hello hello" << std::endl; if (!additional_criterion.node_criterion( pattern_node, concrete_from_pattern(pattern_node))) { - // std::cout << "hello hello hello hello hello" << std::endl; return false; } } - // std::cout << "later mid of pattern amtches subgrpah under" << std::endl; - std::unordered_set concrete_edges = get_edges(subgraph); std::unordered_set concrete_edge_from_match = transform(get_edges(pattern), concrete_from_pattern); @@ -137,14 +133,20 @@ bool pattern_matches_subgraph_under( if (concrete_values != concrete_values_from_match) { return false; } + std::cout << "later later mid of pattern amtches subgrpah under" << std::endl; + for (PatternValue const &pattern_value : get_values(pattern)) { + std::cout << 
"dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvt " << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << pattern_value << std::endl; if (!additional_criterion.value_criterion( pattern_value, concrete_from_pattern(pattern_value))) { + std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " << std::endl; return false; } } - // std::cout << "end of pattern amtches subgrpah under" << std::endl; + std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -154,14 +156,19 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - // std::cout << "unlabelled_pattern_does_match" << std::endl; + std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); - // std::cout << "middle of" << std::endl; + std::cout << "middle of" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << left_entries(match.node_assignment) << std::endl; + std::cout << right_entries(match.node_assignment) << std::endl; + std::cout << get_nodes(pattern) << std::endl; + std::cout << get_nodes(matched_subgraph) << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -169,18 +176,24 @@ bool unlabelled_pattern_does_match( [&](PatternValue const &pv, OpenDataflowValue const &v) { return v.visit(overload{ [&](DataflowOutput const &) { + //std::cout << "whefihweoifhewfi" < Date: Tue, 1 Apr 2025 13:56:25 -0700 Subject: [PATCH 03/11] Adding randomness to MCMC to make it true MCMC, adding secondary non-lazy random mutation generator --- 
.../machine_mapping_mutation_set.h | 29 ++ .../mcmc/machine_mapping_mutation_set.h | 32 -- .../include/compiler/mcmc/mcmc_algorithm.h | 4 +- .../mcmc/mcmc_search_config.struct.toml | 26 ++ .../machine_mapping_mutation_set.cc | 189 ++++++++++++ .../mcmc/machine_mapping_mutation_set.cc | 110 ------- .../src/compiler/mcmc/mcmc_algorithm.cc | 273 +++--------------- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 37 ++- ..._substitution_and_update_machine_mapping.h | 32 ++ ...ly_substitution_and_update_machine_mapping | 185 ++++++++++++ .../unlabelled/pattern_matching.cc | 38 +-- lib/utils/include/utils/random_utils.h | 2 +- 12 files changed, 540 insertions(+), 417 deletions(-) create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h delete mode 100644 lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h create mode 100644 lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml create mode 100644 lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc delete mode 100644 lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc create mode 100644 lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h create mode 100644 lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h new file mode 100644 index 0000000000..443ab06f02 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H + +#include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/search_result.dtg.h" + +namespace FlexFlow { 
+std::optional + get_naive_mapping(ParallelComputationGraph &pcg, + MachineSpecification const &resources); +std::vector + get_possible_mutations(SearchResult mapped_pcg, + MachineSpecification const &resource); +std::optional + get_random_mutation(SearchResult mapped_pcg, + MachineSpecification const &resource, + DeviceType const &device_type = DeviceType::GPU); +MachineView increment_stride(MachineView machine_view, nonnegative_int dim); +MachineView decrement_all_strides(MachineView machine_view); +MachineView change_stride(nonnegative_int stride, + MachineView machine_view, + nonnegative_int dim); +MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view); +MachineView change_device_idx(nonnegative_int device_idx, + MachineView machine_view); +MachineView switch_projection(MachineView machine_view, nonnegative_int dim); +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h deleted file mode 100644 index e41aad2f71..0000000000 --- a/lib/compiler/include/compiler/mcmc/machine_mapping_mutation_set.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H -#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H - -#include "compiler/machine_mapping/machine_mapping.h" -#include "compiler/search_result.dtg.h" - -namespace FlexFlow { -std::vector - get_possible_mutations(SearchResult mapped_pcg, - MachineSpecification const &resource); -MachineMapping permute_layers(std::vector layers, - MachineMapping mapping); -MachineMapping copy_layer(parallel_layer_guid_t source, - parallel_layer_guid_t destination, - MachineMapping mapping); -MachineView change_stride(nonnegative_int stride, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim); -MachineView change_node_idx(nonnegative_int node_ix, - 
parallel_layer_guid_t layer, - MachineView machine_view); -MachineView change_device_idx(nonnegative_int device_idx, - parallel_layer_guid_t layer, - MachineView machine_view); -MachineView change_projection(MachineSpecificationDimension projection, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim); -} // namespace FlexFlow - -#endif diff --git a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h index 53efa845cf..b17eaf3e16 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h +++ b/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H #include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/mcmc/mcmc_search_config.dtg.h" #include "compiler/search_result.dtg.h" -#include "compiler/unity_algorithm/unity_search_config.dtg.h" #include "pcg/computation_graph.h" #include "pcg/machine_specification.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" @@ -15,7 +15,7 @@ namespace FlexFlow { SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, - UnitySearchConfig const &search_config); + MCMCSearchConfig const &search_config); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml new file mode 100644 index 0000000000..e96ced81cd --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "MCMCSearchConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ +] + +[[fields]] +name = "temperature" +type = "float" + +[[fields]] +name = "num_iterations" +type = "int" + +[[fields]] +name = "num_mutations_per_iteration" +type = "int" + +[[fields]] +name = "max_num_ops" +type = "int" diff --git 
a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc new file mode 100644 index 0000000000..36651fdc5d --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -0,0 +1,189 @@ +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" +#include "utils/vector.h" + +namespace FlexFlow { + +std::optional + get_naive_mapping(ParallelComputationGraph &pcg, + MachineSpecification const &resources) { + std::vector layers = topological_ordering(pcg); + std::unordered_map machine_views; + for (parallel_layer_guid_t layer : layers) { + OperatorTaskSpace task = get_operator_task_space(pcg, layer); + std::unordered_set allowed_machine_views = + get_allowed_machine_views(resources, task, DeviceType::GPU); + if (allowed_machine_views.empty()) { + return std::nullopt; + } + machine_views.insert({layer, *(allowed_machine_views.begin())}); + } + return MachineMapping{machine_views}; +} + +std::optional + get_random_mutation_notlazy(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { + ParallelComputationGraph pcg = mapped_pcg.pcg; + std::vector layers = topological_ordering(pcg); + if (layers.size() == 0) { + return std::nullopt; + } + parallel_layer_guid_t random_layer = select_random(layers); + MachineMapping machine_mapping = mapped_pcg.machine_mapping; + MachineView machine_view = machine_mapping.machine_views.at(random_layer); + + int mutation_op = select_random(range(6)); + switch (mutation_op) { + case 0: { + machine_view = decrement_all_strides(machine_view); + break; + } + case 1: { + nonnegative_int rand_dim = select_random( + 
nonnegative_range(nonnegative_int{num_dims(machine_view)})); + machine_view = increment_stride(machine_view, rand_dim); + break; + } + case 2: { + nonnegative_int rand_node_idx = + select_random(nonnegative_range(resources.num_nodes)); + machine_view = change_node_idx(rand_node_idx, machine_view); + break; + } + case 3: { + if (device_type == DeviceType::GPU) { + nonnegative_int rand_device_idx = + select_random(nonnegative_range(resources.num_gpus_per_node)); + machine_view = change_device_idx(rand_device_idx, machine_view); + } else { + nonnegative_int rand_device_idx = + select_random(nonnegative_range(resources.num_cpus_per_node)); + machine_view = change_device_idx(rand_device_idx, machine_view); + } + break; + } + case 4: { + nonnegative_int rand_dim = select_random( + nonnegative_range(nonnegative_int{num_dims(machine_view)})); + machine_view = switch_projection(machine_view, rand_dim); + break; + } + case 5: { + // copy layer + parallel_layer_guid_t layer_to_copy = select_random(layers); + machine_view = machine_mapping.machine_views.at(layer_to_copy); + break; + } + } + OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); + if (is_valid_machine_view(machine_view, task, resources)) { + // only apply it if valid + machine_mapping.machine_views.at(random_layer) = machine_view; + } + return machine_mapping; +} + +std::optional + get_random_mutation(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { + ParallelComputationGraph pcg = mapped_pcg.pcg; + std::vector layers = topological_ordering(pcg); + if (layers.size() == 0) { + return std::nullopt; + } + parallel_layer_guid_t random_layer = layers.at(rand() % layers.size()); + + MachineMapping machine_mapping = mapped_pcg.machine_mapping; + MachineView machine_view = machine_mapping.machine_views.at(random_layer); + OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); + + std::vector allowed_machine_views = + 
vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); + MachineView random_new_machine_view = + allowed_machine_views.at(rand() % allowed_machine_views.size()); + + machine_mapping.machine_views.at(random_layer) = random_new_machine_view; + return machine_mapping; +} + +MachineView increment_stride(MachineView machine_view, nonnegative_int dim) { + std::vector strides = get_strides(machine_view); + nonnegative_int new_stride = + strides.at(dim.unwrap_nonnegative()).unwrapped + 1_n; + return change_stride(new_stride, machine_view, dim); +} + +MachineView decrement_all_strides(MachineView machine_view) { + std::vector strides = get_strides(machine_view); + for (nonnegative_int dim : + nonnegative_range(nonnegative_int{num_dims(machine_view)})) { + nonnegative_int old_stride = strides.at(dim.unwrap_nonnegative()).unwrapped; + if (old_stride >= 1_n) { + machine_view = + change_stride(nonnegative_int{old_stride.unwrap_nonnegative() - 1}, + machine_view, + dim); + } + } + return machine_view; +} + +MachineView change_stride(nonnegative_int stride, + MachineView machine_view, + nonnegative_int dim) { + std::vector strides = get_strides(machine_view); + strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, strides, get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + MachineSpaceCoordinate{node_ix, + machine_view.start.device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView change_device_idx(nonnegative_int device_idx, + MachineView machine_view) { + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + 
MachineSpaceCoordinate{machine_view.start.node_idx, + device_idx, + machine_view.start.device_type}, + get_strides(machine_view), + get_dimensions(machine_view)); + return new_machine_view; +} + +MachineView switch_projection(MachineView machine_view, nonnegative_int dim) { + std::vector dims = + get_dimensions(machine_view); + MachineSpecificationDimension projection = dims.at(dim.unwrap_nonnegative()); + if (projection == MachineSpecificationDimension::INTER_NODE) { + dims.at(dim.unwrap_nonnegative()) = + MachineSpecificationDimension::INTRA_NODE; + } else { + dims.at(dim.unwrap_nonnegative()) = + MachineSpecificationDimension::INTER_NODE; + } + MachineView new_machine_view = + machine_view_from_strides_and_machine_spec_dimensions( + machine_view.start, get_strides(machine_view), dims); + return new_machine_view; +} +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc deleted file mode 100644 index d67e4cb592..0000000000 --- a/lib/compiler/src/compiler/mcmc/machine_mapping_mutation_set.cc +++ /dev/null @@ -1,110 +0,0 @@ -#include "compiler/mcmc/machine_mapping_mutation_set.h" -#include "compiler/machine_mapping/allowed_machine_views.h" -#include "pcg/machine_view.h" -#include "pcg/operator_task_space.h" -#include "utils/containers/vector_of.h" -#include "utils/vector.h" - -namespace FlexFlow { - -bool mutation_is_allowed(ParallelComputationGraph &pcg, - parallel_layer_guid_t layer, - MachineSpecification const &resources, - MachineView machine_view) { - OperatorTaskSpace task = get_operator_task_space(pcg, layer); - std::unordered_set allowed_machine_views = - get_allowed_machine_views(resources, task, DeviceType::GPU); - return allowed_machine_views.count(machine_view); -} - -std::vector - get_possible_mutations(SearchResult mapped_pcg, - MachineSpecification const &resources) { - //each mutation only changes one layer at a time - 
ParallelComputationGraph pcg = mapped_pcg.pcg; - std::vector layers = topological_ordering(pcg); - std::vector machine_mappings; - for (parallel_layer_guid_t layer : layers) { - MachineMapping original_mapping = mapped_pcg.machine_mapping; - MachineView machine_view = original_mapping.machine_views.at(layer); - OperatorTaskSpace task = get_operator_task_space(pcg, layer); - std::vector allowed_machine_views = - vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); - - std::vector new_machine_mappings = - transform(allowed_machine_views, [&](MachineView machine_views) { - MachineMapping original_mapping = mapped_pcg.machine_mapping; - original_mapping.machine_views.at(layer) = machine_views; - return original_mapping; - }); - machine_mappings = concat(machine_mappings, new_machine_mappings); - } - return machine_mappings; -} - -MachineMapping permute_layers(std::vector layers, - MachineMapping mapping) { - NOT_IMPLEMENTED(); -} - -MachineMapping copy_layer(parallel_layer_guid_t source, - parallel_layer_guid_t destination, - MachineMapping mapping) { - std::unordered_map machine_views = - mapping.machine_views; - MachineView machine_view_to_copy = machine_views.at(source); - machine_views.try_emplace(destination, machine_view_to_copy); - return MachineMapping{machine_views}; -} - -MachineView change_stride(nonnegative_int stride, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim) { - std::vector strides = get_strides(machine_view); - strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, strides, get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_node_idx(nonnegative_int node_ix, - parallel_layer_guid_t layer, - MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{node_ix, - 
machine_view.start.device_idx, - machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_device_idx(nonnegative_int device_idx, - parallel_layer_guid_t layer, - MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{machine_view.start.node_idx, - device_idx, - machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_projection(MachineSpecificationDimension projection, - parallel_layer_guid_t layer, - MachineView machine_view, - nonnegative_int dim) { - std::vector dims = - get_dimensions(machine_view); - dims.at(dim.unwrap_nonnegative()) = projection; - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, get_strides(machine_view), dims); - return new_machine_view; -} -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc index 025fb586c6..6553823252 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,224 +1,22 @@ #include "compiler/mcmc/mcmc_algorithm.h" -#include "compiler/machine_mapping/allowed_machine_views.h" -#include "compiler/mcmc/machine_mapping_mutation_set.h" +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" #include "compiler/mcmc/mcmc_graph_optimize_state.h" #include "compiler/task_graph_simulator/task_simulator.h" -#include "pcg/operator_task_space.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" -#include "substitutions/apply_substitution/apply_substitution.h" -#include 
"substitutions/apply_substitution/evaluate_substitution_output.h" -#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" -#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" -#include "substitutions/sub_parallel_computation_graph.h" -#include "substitutions/sub_parallel_computation_graph_data.dtg.h" -#include "substitutions/sub_parallel_computation_graph_edge.h" #include "substitutions/substitution.h" #include "substitutions/unity_substitution_set.h" -#include "utils/containers/keys.h" -#include "utils/containers/merge_maps.h" -#include "utils/containers/restrict_keys.h" -#include "utils/containers/set_minus.h" -#include "utils/containers/transform.h" -#include "utils/containers/values.h" -#include "utils/deduplicated_priority_queue.h" -#include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/graph/node/algorithms.h" #include "utils/optional.h" -#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" - +#include "utils/random_utils.h" namespace FlexFlow { -std::optional - get_naive_mapping(ParallelComputationGraph &pcg, - MachineSpecification const &resources) { - std::vector layers = topological_ordering(pcg); - std::unordered_map machine_views; - for (parallel_layer_guid_t layer : layers) { - OperatorTaskSpace task = get_operator_task_space(pcg, layer); - std::unordered_set allowed_machine_views = - get_allowed_machine_views(resources, task, DeviceType::GPU); - if (allowed_machine_views.empty()) { - return std::nullopt; - } - machine_views.insert({layer, *(allowed_machine_views.begin())}); - } - return MachineMapping{machine_views}; -} - -SearchResult apply_substitution_and_update_machine_mapping( - SearchResult const &mapped_pcg, - Substitution const &sub, - PCGPatternMatch const &match) { - SubParallelComputationGraph spcg = 
sub_pcg_from_full_pcg(mapped_pcg.pcg); - - auto substitution_output_result = - evaluate_substitution_output(spcg, sub, match); - SubParallelComputationGraph substitution_output_graph = - substitution_output_result.first; - OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = - substitution_output_result.second; - - SubParallelComputationGraphData output_graph_data = - get_sub_pcg_data(substitution_output_graph); - SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); - - std::unordered_set pre_nodes = - keys(pre_data.node_data); - std::unordered_set matched_nodes = - unordered_set_of(values(match.node_assignment)); - std::unordered_set post_nodes_from_original_graph = - set_minus(pre_nodes, matched_nodes); - - std::unordered_map machine_views = - mapped_pcg.machine_mapping.machine_views; - - std::unordered_set substituted_machine_views = - transform(matched_nodes, [&](parallel_layer_guid_t const &node) { - return machine_views.at(node); - }); - - std::unordered_map post_node_data = - [&] { - std::unordered_map - post_node_data_from_orig = restrict_keys( - pre_data.node_data, post_nodes_from_original_graph); - std::unordered_map - post_node_data_from_sub = output_graph_data.node_data; - - // just taking the first substituted machine view, not sure if this - // is fine - for (auto [layer, attrs] : post_node_data_from_sub) { - machine_views.try_emplace(layer, *substituted_machine_views.begin()); - } - - return merge_disjoint_maps(post_node_data_from_orig, - post_node_data_from_sub); - }(); - - std::unordered_set post_edges = [&] { - std::unordered_set post_edges_from_orig = - filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { - if (e.raw_edge.has()) { - return true; - } else { - DataflowEdge dfe = e.raw_edge.get(); - parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; - parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; - return !(contains(matched_nodes, src) || - contains(matched_nodes, 
dst)); - } - }); - - std::unordered_set post_edges_from_sub = - filter(output_graph_data.edges, - [&](SubParallelComputationGraphEdge const &e) { - return !e.raw_edge.has(); - }); - - bidict - output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( - match, sub.pcg_pattern, spcg); - bidict - output_post_outexpr_mapping = get_output_graph_expr_output_mapping( - output_expr_to_result_sub_pcg_mapping, - sub.output_graph_expr, - substitution_output_graph); - - std::unordered_set incoming_to_sub_edges; - for (auto const &[pattern_input, base_graph_tensor] : - match.input_assignment) { - OutputGraphExprInput output_expr_input = - sub.inputs_mapping.at_l(pattern_input); - input_parallel_tensor_guid_t output_graph_input = - output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( - output_expr_input); - std::unordered_set uses = get_parallel_tensor_uses( - substitution_output_graph, - open_parallel_tensor_guid_from_input(output_graph_input)); - for (parallel_tensor_use_t const &use : uses) { - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_use(base_graph_tensor, use); - incoming_to_sub_edges.insert(new_edge); - } - } - - std::unordered_set outgoing_from_sub_edges; - for (ParallelComputationGraphEdge const &outgoing_edge : - get_subgraph_outgoing_edges(spcg, matched_nodes)) { - parallel_tensor_guid_t original_tensor = - get_parallel_tensor(outgoing_edge); - PatternNodeOutput pattern_tensor = - output_orig_pattern_mapping.at_r(original_tensor); - OutputGraphExprNodeOutput output_graph_tensor = - sub.outputs_mapping.at_l(pattern_tensor); - parallel_tensor_guid_t new_tensor = - output_post_outexpr_mapping.at_r(output_graph_tensor); - - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_dst( - new_tensor, - get_dst_layer(outgoing_edge), - get_dst_layer_input_idx(outgoing_edge)); - outgoing_from_sub_edges.insert(new_edge); - } - - return set_union(std::vector{ - post_edges_from_orig, - post_edges_from_sub, - 
incoming_to_sub_edges, - outgoing_from_sub_edges, - }); - }(); - - std::unordered_set post_inputs = - pre_data.inputs; - - std::unordered_map - post_value_data = [&] { - std::unordered_map - post_value_data_from_orig = filter_keys( - pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { - return visit_open_parallel_tensor_guid( - t, - overload{ - [&](parallel_tensor_guid_t const &t) { - return contains(post_nodes_from_original_graph, - get_source_layer(t)); - }, - [](input_parallel_tensor_guid_t const &) { - return true; - }, - }); - }); - - std::unordered_map - post_value_data_from_sub = output_graph_data.value_data; - return merge_disjoint_maps(post_value_data_from_orig, - post_value_data_from_sub); - }(); - - SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ - post_node_data, - post_edges, - post_inputs, - post_value_data, - }; - - return SearchResult{ - pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), - MachineMapping{machine_views}}; -} - std::vector all_pcgs_obtained_by_applying_a_substitution( SearchResult const &mapped_pcg, std::vector const &substitutions) { std::vector results; - //currently not functional + // currently not functional /*SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); for (Substitution const &substitution : substitutions) { for (PCGPatternMatch const &pattern_match : @@ -232,13 +30,16 @@ std::vector all_pcgs_obtained_by_applying_a_substitution( return results; } +bool mcmc_accept(int delta, float temperature) { + return delta < 0 || randf() < exp(-delta / temperature); +} + SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, - UnitySearchConfig const &search_config) { + MCMCSearchConfig const &search_config) { std::vector substitutions = get_substitution_set(resources); - DeduplicatedPriorityQueue candidates; std::optional naive_mapping = get_naive_mapping(pcg, 
resources); @@ -246,31 +47,20 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, throw std::runtime_error("Failed to find any solutions"); } - // multiply runtime by -1 to make it minheap instead of maxheap - MCMCOptimizeState best_state = MCMCOptimizeState{ + MCMCOptimizeState current_state = MCMCOptimizeState{ SearchResult{pcg, naive_mapping.value()}, - -1 * task_simulator_estimate_forward_pass_time( - pcg, cost_estimator, naive_mapping.value(), resources)}; + task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, naive_mapping.value(), resources)}; - candidates.push(best_state); + MCMCOptimizeState best_state = current_state; - for (int iteration = 0; - !candidates.empty() && iteration < search_config.budget; + for (int iteration = 0; iteration < search_config.num_iterations; ++iteration) { - MCMCOptimizeState current_state = candidates.top(); - candidates.pop(); SearchResult current_mapped_pcg = current_state.mapped_pcg; - float current_estimate = current_state.runtime * -1; - float best_estimate = best_state.runtime * -1; + float best_estimate = best_state.runtime; - if (current_estimate < best_estimate) { - best_state = current_state; - } else if (current_estimate > best_estimate * search_config.alpha) { - continue; - } - - for (SearchResult const &new_mapped_pcg : + /*for (SearchResult const &new_mapped_pcg : all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, substitutions)) { float new_estimate = task_simulator_estimate_forward_pass_time( @@ -284,21 +74,36 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, search_config.max_num_ops) { candidates.push(MCMCOptimizeState{new_mapped_pcg, -1 * new_estimate}); } - } - - for (MachineMapping const &new_machine_mapping : - get_possible_mutations(current_mapped_pcg, resources)) { + }*/ + + std::optional new_machine_mapping = + get_random_mutation(current_mapped_pcg, resources); + for (int searched_mutations = 0; + searched_mutations < 
search_config.num_mutations_per_iteration; + searched_mutations++) { + if (new_machine_mapping == std::nullopt) { + break; + } float new_estimate = task_simulator_estimate_forward_pass_time(current_mapped_pcg.pcg, cost_estimator, - new_machine_mapping, + new_machine_mapping.value(), resources); - if (new_estimate <= search_config.threshold) { - candidates.push( - MCMCOptimizeState{SearchResult{current_mapped_pcg.pcg, new_machine_mapping}, -1 * new_estimate}); + float runtime_delta = new_estimate - best_estimate; + + if (mcmc_accept(runtime_delta, search_config.temperature)) { + current_state = MCMCOptimizeState{ + SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, + new_estimate}; + if (runtime_delta < 0) { + best_state = current_state; + } } + + new_machine_mapping = get_random_mutation(current_mapped_pcg, resources); } } + return best_state.mapped_pcg; } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index a7ffa8e5e0..7cde75cecf 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,5 +1,6 @@ #include "compiler/mcmc/mcmc_algorithm.h" #include "../cost_estimator_for_test.h" +#include "compiler/task_graph_simulator/task_simulator.h" #include "doctest/doctest.h" #include "op-attrs/parallel_tensor_dims.h" #include "op-attrs/parallel_tensor_shape.dtg.h" @@ -9,7 +10,6 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "pcg/pcg_from_computation_graph.h" #include "utils/integer_conversions.h" -#include "compiler/task_graph_simulator/task_simulator.h" using namespace FlexFlow; @@ -19,18 +19,17 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_tensor_shape = TensorShape{ TensorDims{ - FFOrdered{nonnegative_int{32}, - nonnegative_int{64}}, + FFOrdered{32_n, 64_n}, }, DataType::FLOAT, }; tensor_guid_t t = b.create_input(input_tensor_shape, 
CreateGrad::YES); t = b.dense(t, - /*outDim=*/nonnegative_int{16}, + /*outDim=*/16_n, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/nonnegative_int{12}, + /*outDim=*/12_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -38,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*bias_initializer=*/std::nullopt); t = b.relu(t); t = b.dense(t, - /*outDim=*/nonnegative_int{8}, + /*outDim=*/8_n, /*activation=*/Activation::RELU); return b.computation_graph; }(); @@ -50,33 +49,33 @@ TEST_SUITE(FF_TEST_SUITE) { return OpCostMetrics{ /*forward_runtime=*/1.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; }, [](TensorSetMovement const &) { return 1.0; }); MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/nonnegative_int{2}, - /*num_cpus_per_node=*/nonnegative_int{1}, - /*num_gpus_per_node=*/nonnegative_int{1}, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; - UnitySearchConfig search_config = UnitySearchConfig{ - /*alpha=*/1.2, - /*budget=*/10, - /*threshold=*/30.0, + MCMCSearchConfig search_config = MCMCSearchConfig{ + /*temperature=*/1.0, + /*num_iterations=*/100, + /*num_mutations_per_iteration=*/10, /*max_num_ops=*/100, }; SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); + float runtime = task_simulator_estimate_forward_pass_time( + result.pcg, cost_estimator, result.machine_mapping, full_machine_spec); + std::cout << runtime << std::endl; - CHECK(task_simulator_estimate_forward_pass_time(result.pcg, - cost_estimator, - result.machine_mapping, - full_machine_spec) < 16); - + CHECK(runtime < 16); + CHECK(false); } } diff --git a/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h 
b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h new file mode 100644 index 0000000000..b08ca57851 --- /dev/null +++ b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H + +#include "compiler/search_result.dtg.h" +#include "substitutions/pcg_pattern_match.dtg.h" +#include "substitutions/sub_parallel_computation_graph.dtg.h" +#include "substitutions/substitution.dtg.h" + +namespace FlexFlow { +/** + * @brief Applies \p substitution to \p mapped_pcg at the location specified by + * \p match, returning the resulting SearchResult (mapped pcg) + * + * @param mapped_pcg + * @param substitution + * @param match The location at which to apply substitution. This location in + * sub_pcg should match substitution's PCGPattern. Likely created by running + * FlexFlow::find_pattern_matches(PCGPattern const &, + * SubParallelComputationGraph const &). 
+ * @return SearchResult A mapped pcg similar to mapped_pcg, but with + * the subgraph of the pcg specified by match replaced with the result of the + * output expression of substitution and the machine mapping updated to account + * for the new output + */ +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping new file mode 100644 index 0000000000..1721ee26d8 --- /dev/null +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping @@ -0,0 +1,185 @@ +#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/sub_parallel_computation_graph_data.dtg.h" +#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match) { + SubParallelComputationGraph spcg = 
sub_pcg_from_full_pcg(mapped_pcg.pcg); + + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map machine_views = + mapped_pcg.machine_mapping.machine_views; + + std::unordered_set substituted_machine_views = + transform(matched_nodes, [&](parallel_layer_guid_t const &node) { + return machine_views.at(node); + }); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + // just taking the first substituted machine view, not sure if this + // is fine + for (auto [layer, attrs] : post_node_data_from_sub) { + machine_views.try_emplace(layer, *substituted_machine_views.begin()); + } + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, 
dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_use(base_graph_tensor, use); + incoming_to_sub_edges.insert(new_edge); + } + } + + std::unordered_set outgoing_from_sub_edges; + for (ParallelComputationGraphEdge const &outgoing_edge : + get_subgraph_outgoing_edges(spcg, matched_nodes)) { + parallel_tensor_guid_t original_tensor = + get_parallel_tensor(outgoing_edge); + PatternNodeOutput pattern_tensor = + output_orig_pattern_mapping.at_r(original_tensor); + OutputGraphExprNodeOutput output_graph_tensor = + sub.outputs_mapping.at_l(pattern_tensor); + parallel_tensor_guid_t new_tensor = + output_post_outexpr_mapping.at_r(output_graph_tensor); + + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_dst( + new_tensor, + get_dst_layer(outgoing_edge), + get_dst_layer_input_idx(outgoing_edge)); + outgoing_from_sub_edges.insert(new_edge); + } + + return set_union(std::vector{ + post_edges_from_orig, + post_edges_from_sub, + 
incoming_to_sub_edges, + outgoing_from_sub_edges, + }); + }(); + + std::unordered_set post_inputs = + pre_data.inputs; + + std::unordered_map + post_value_data = [&] { + std::unordered_map + post_value_data_from_orig = filter_keys( + pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { + return visit_open_parallel_tensor_guid( + t, + overload{ + [&](parallel_tensor_guid_t const &t) { + return contains(post_nodes_from_original_graph, + get_source_layer(t)); + }, + [](input_parallel_tensor_guid_t const &) { + return true; + }, + }); + }); + + std::unordered_map + post_value_data_from_sub = output_graph_data.value_data; + return merge_disjoint_maps(post_value_data_from_orig, + post_value_data_from_sub); + }(); + + SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ + post_node_data, + post_edges, + post_inputs, + post_value_data, + }; + + return SearchResult{ + pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), + MachineMapping{machine_views}}; +} + +} // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index cfb34aac3a..d73764dc3e 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,8 +97,8 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << "gamer" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << "gamer" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; @@ -135,18 +135,18 @@ bool pattern_matches_subgraph_under( } 
std::cout << "later later mid of pattern amtches subgrpah under" << std::endl; - for (PatternValue const &pattern_value : get_values(pattern)) { std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvt " << std::endl; std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; std::cout << pattern_value << std::endl; if (!additional_criterion.value_criterion( pattern_value, concrete_from_pattern(pattern_value))) { - std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " << std::endl; + std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " + << std::endl; return false; } } - std::cout << "end of pattern amtches subgrpah under" << std::endl; + std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -156,19 +156,19 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << "unlabelled_pattern_does_match" << std::endl; + std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); - std::cout << "middle of" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; - std::cout << left_entries(match.node_assignment) << std::endl; - std::cout << right_entries(match.node_assignment) << std::endl; - std::cout << get_nodes(pattern) << std::endl; - std::cout << get_nodes(matched_subgraph) << std::endl; + std::cout << "middle of" << std::endl; + std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; + std::cout << left_entries(match.node_assignment) << std::endl; + std::cout << right_entries(match.node_assignment) << std::endl; + std::cout 
<< get_nodes(pattern) << std::endl; + std::cout << get_nodes(matched_subgraph) << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -176,24 +176,24 @@ bool unlabelled_pattern_does_match( [&](PatternValue const &pv, OpenDataflowValue const &v) { return v.visit(overload{ [&](DataflowOutput const &) { - //std::cout << "whefihweoifhewfi" < #include -float randf() { +inline float randf() { return static_cast(std::rand()) / static_cast(RAND_MAX); } From 355fe3f29518f875b5ef00ff13285eab5c51c892 Mon Sep 17 00:00:00 2001 From: Victor Li Date: Thu, 3 Apr 2025 01:58:52 -0700 Subject: [PATCH 04/11] Adding substitutions to MCMC (not quite working yet) --- ..._substitution_and_update_machine_mapping.h | 0 .../machine_mapping_mutation_set.h | 5 +- .../mcmc/mcmc_search_config.struct.toml | 5 + ...ubstitution_and_update_machine_mapping.cc} | 4 +- .../machine_mapping_mutation_set.cc | 24 ++-- .../src/compiler/mcmc/mcmc_algorithm.cc | 107 +++++++++--------- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 12 +- .../include/substitutions/pcg_pattern.h | 4 + .../substitutions/unity_substitution_set.h | 2 + .../operator_pattern/satisfies_constraint.cc | 1 - .../src/substitutions/pcg_pattern.cc | 12 ++ .../substitutions/unity_substitution_set.cc | 10 ++ .../unlabelled/pattern_matching.cc | 23 ---- 13 files changed, 108 insertions(+), 101 deletions(-) rename lib/{substitutions/include/substitutions/apply_substitution => compiler/include/compiler/machine_mapping}/apply_substitution_and_update_machine_mapping.h (100%) rename lib/{substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping => compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc} (98%) diff --git a/lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h similarity index 
100% rename from lib/substitutions/include/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h rename to lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h index 443ab06f02..6dfefec7d1 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -7,14 +7,15 @@ namespace FlexFlow { std::optional get_naive_mapping(ParallelComputationGraph &pcg, - MachineSpecification const &resources); + MachineSpecification const &resources, + DeviceType const &device_type); std::vector get_possible_mutations(SearchResult mapped_pcg, MachineSpecification const &resource); std::optional get_random_mutation(SearchResult mapped_pcg, MachineSpecification const &resource, - DeviceType const &device_type = DeviceType::GPU); + DeviceType const &device_type); MachineView increment_stride(MachineView machine_view, nonnegative_int dim); MachineView decrement_all_strides(MachineView machine_view); MachineView change_stride(nonnegative_int stride, diff --git a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml index e96ced81cd..6bc5d98be7 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml +++ b/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml @@ -7,6 +7,7 @@ features = [ ] includes = [ + "pcg/device_type.dtg.h" ] [[fields]] @@ -24,3 +25,7 @@ type = "int" [[fields]] name = "max_num_ops" type = "int" + +[[fields]] +name = "device_type" +type = "::FlexFlow::DeviceType" \ No newline at end of file diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping 
b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc similarity index 98% rename from lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping rename to lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc index 1721ee26d8..411ee67145 100644 --- a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution_and_update_machine_mapping +++ b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc @@ -1,4 +1,4 @@ -#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" +#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" #include "substitutions/apply_substitution/evaluate_substitution_output.h" @@ -181,5 +181,3 @@ SearchResult apply_substitution_and_update_machine_mapping( } } // namespace FlexFlow - -#endif \ No newline at end of file diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc index 36651fdc5d..7f7a54d07a 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -11,7 +11,8 @@ namespace FlexFlow { std::optional get_naive_mapping(ParallelComputationGraph &pcg, - MachineSpecification const &resources) { + MachineSpecification const &resources, + DeviceType const &device_type) { std::vector layers = topological_ordering(pcg); std::unordered_map machine_views; for (parallel_layer_guid_t layer : layers) { @@ -27,9 +28,9 @@ std::optional } std::optional - get_random_mutation_notlazy(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const 
&device_type) { + get_random_mutation(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; std::vector layers = topological_ordering(pcg); if (layers.size() == 0) { @@ -90,25 +91,26 @@ std::optional return machine_mapping; } +// "lazy" version just picks a random available machine view for a random layer std::optional - get_random_mutation(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const &device_type) { + get_random_mutation_lazy(SearchResult mapped_pcg, + MachineSpecification const &resources, + DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; std::vector layers = topological_ordering(pcg); if (layers.size() == 0) { return std::nullopt; } - parallel_layer_guid_t random_layer = layers.at(rand() % layers.size()); + parallel_layer_guid_t random_layer = select_random(layers); + ; MachineMapping machine_mapping = mapped_pcg.machine_mapping; MachineView machine_view = machine_mapping.machine_views.at(random_layer); OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); std::vector allowed_machine_views = - vector_of(get_allowed_machine_views(resources, task, DeviceType::GPU)); - MachineView random_new_machine_view = - allowed_machine_views.at(rand() % allowed_machine_views.size()); + vector_of(get_allowed_machine_views(resources, task, device_type)); + MachineView random_new_machine_view = select_random(allowed_machine_views); machine_mapping.machine_views.at(random_layer) = random_new_machine_view; return machine_mapping; diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc index 6553823252..f8ef392eee 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,8 +1,8 @@ #include "compiler/mcmc/mcmc_algorithm.h" +#include 
"compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "compiler/machine_mapping/machine_mapping_mutation_set.h" #include "compiler/mcmc/mcmc_graph_optimize_state.h" #include "compiler/task_graph_simulator/task_simulator.h" -#include "substitutions/apply_substitution/apply_substitution_and_update_machine_mapping.h" #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/substitution.h" @@ -12,28 +12,28 @@ namespace FlexFlow { -std::vector all_pcgs_obtained_by_applying_a_substitution( - SearchResult const &mapped_pcg, - std::vector const &substitutions) { - std::vector results; - // currently not functional - /*SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); - for (Substitution const &substitution : substitutions) { - for (PCGPatternMatch const &pattern_match : - find_pattern_matches(substitution.pcg_pattern, subpcg)) { - SearchResult mapped_pcg_from_substitution = - apply_substitution_and_update_machine_mapping( - mapped_pcg, substitution, pattern_match); - results.push_back(mapped_pcg_from_substitution); - } - }*/ - return results; -} - bool mcmc_accept(int delta, float temperature) { return delta < 0 || randf() < exp(-delta / temperature); } +void modify_graph_state(MCMCOptimizeState &best_state, + MCMCOptimizeState ¤t_state, + SearchResult candidate, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + MCMCSearchConfig const &search_config) { + float best_estimate = best_state.runtime; + float new_estimate = task_simulator_estimate_forward_pass_time( + candidate.pcg, cost_estimator, candidate.machine_mapping, resources); + float runtime_delta = new_estimate - best_estimate; + if (mcmc_accept(runtime_delta, search_config.temperature)) { + current_state = MCMCOptimizeState{candidate, new_estimate}; + if (runtime_delta < 0) { + best_state = current_state; + } + } +} + SearchResult mcmc_graph_optimize(ParallelComputationGraph 
&pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, @@ -42,7 +42,7 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, std::vector substitutions = get_substitution_set(resources); std::optional naive_mapping = - get_naive_mapping(pcg, resources); + get_naive_mapping(pcg, resources, search_config.device_type); if (naive_mapping == std::nullopt) { throw std::runtime_error("Failed to find any solutions"); } @@ -58,49 +58,46 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, ++iteration) { SearchResult current_mapped_pcg = current_state.mapped_pcg; - float best_estimate = best_state.runtime; - /*for (SearchResult const &new_mapped_pcg : - all_pcgs_obtained_by_applying_a_substitution(current_mapped_pcg, - substitutions)) { - float new_estimate = task_simulator_estimate_forward_pass_time( - new_mapped_pcg.pcg, - cost_estimator, - new_mapped_pcg.machine_mapping, - resources); - - if (new_estimate <= search_config.threshold && - get_nodes(new_mapped_pcg.pcg.raw_graph).size() <= - search_config.max_num_ops) { - candidates.push(MCMCOptimizeState{new_mapped_pcg, -1 * new_estimate}); - } - }*/ - - std::optional new_machine_mapping = - get_random_mutation(current_mapped_pcg, resources); - for (int searched_mutations = 0; + std::optional new_machine_mapping = get_random_mutation( + current_mapped_pcg, resources, search_config.device_type); + for (int searched_mutations = 1; searched_mutations < search_config.num_mutations_per_iteration; searched_mutations++) { if (new_machine_mapping == std::nullopt) { break; } - float new_estimate = - task_simulator_estimate_forward_pass_time(current_mapped_pcg.pcg, - cost_estimator, - new_machine_mapping.value(), - resources); - float runtime_delta = new_estimate - best_estimate; + modify_graph_state( + best_state, + current_state, + SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, + cost_estimator, + resources, + search_config); - if (mcmc_accept(runtime_delta, 
search_config.temperature)) { - current_state = MCMCOptimizeState{ - SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, - new_estimate}; - if (runtime_delta < 0) { - best_state = current_state; - } - } + new_machine_mapping = get_random_mutation( + current_mapped_pcg, resources, search_config.device_type); + } - new_machine_mapping = get_random_mutation(current_mapped_pcg, resources); + std::optional random_substitution = + get_random_substitution(resources); + if (random_substitution != std::nullopt) { + std::optional pattern_match = get_random_pattern_match( + random_substitution.value().pcg_pattern, + sub_pcg_from_full_pcg(current_mapped_pcg.pcg)); + if (pattern_match != std::nullopt) { + SearchResult new_mapped_pcg = + apply_substitution_and_update_machine_mapping( + current_mapped_pcg, + random_substitution.value(), + pattern_match.value()); + modify_graph_state(best_state, + current_state, + new_mapped_pcg, + cost_estimator, + resources, + search_config); + } } } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index 7cde75cecf..7aad8b098d 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -62,12 +62,12 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - MCMCSearchConfig search_config = MCMCSearchConfig{ - /*temperature=*/1.0, - /*num_iterations=*/100, - /*num_mutations_per_iteration=*/10, - /*max_num_ops=*/100, - }; + MCMCSearchConfig search_config = + MCMCSearchConfig{/*temperature=*/1.0, + /*num_iterations=*/5, + /*num_mutations_per_iteration=*/10, + /*max_num_ops=*/100, + /*device_type=*/DeviceType::GPU}; SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h index f0962b15c2..5005a0b51c 100644 --- 
a/lib/substitutions/include/substitutions/pcg_pattern.h +++ b/lib/substitutions/include/substitutions/pcg_pattern.h @@ -12,6 +12,10 @@ namespace FlexFlow { std::unordered_set get_nodes(PCGPattern const &); +std::optional + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg); + /** * @brief Find all locations in \p pcg that match \p pattern */ diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h index 183f76ac8a..959ba3da2c 100644 --- a/lib/substitutions/include/substitutions/unity_substitution_set.h +++ b/lib/substitutions/include/substitutions/unity_substitution_set.h @@ -6,6 +6,8 @@ #include "utils/fmt/vector.h" namespace FlexFlow { +std::optional + get_random_substitution(MachineSpecification const &resources); std::vector get_substitution_set(MachineSpecification const &resources); diff --git a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc index a95db6fbb4..f39b771364 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc @@ -13,7 +13,6 @@ bool operator_satisfies_constraint( return false; } - // std::cout << constraint.constraint_type << std::endl; switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val.value() == constraint.attribute_value; diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index a0af875848..fbc181a0f9 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -11,6 +11,7 @@ #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" #include 
"utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" +#include "utils/random_utils.h" namespace FlexFlow { @@ -37,6 +38,17 @@ static MatchAdditionalCriterion }}; } +std::optional + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg) { + std::vector pattern_matches = + find_pattern_matches(pattern, pcg); + if (pattern_matches.empty()) { + return std::nullopt; + } + return select_random(pattern_matches); +} + std::vector find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc index 4b00cdd95f..c8d9266978 100644 --- a/lib/substitutions/src/substitutions/unity_substitution_set.cc +++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc @@ -7,9 +7,19 @@ #include "utils/containers/get_only.h" #include "utils/nonnegative_int/nonnegative_int.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" namespace FlexFlow { +std::optional + get_random_substitution(MachineSpecification const &resources) { + std::vector substitutions = get_substitution_set(resources); + if (substitutions.empty()) { + return std::nullopt; + } + return select_random(substitutions); +} + std::vector get_substitution_set(MachineSpecification const &resources) { std::vector substitutions; diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index d73764dc3e..304bb8cf46 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -97,8 +97,6 @@ bool pattern_matches_subgraph_under( &full_graph_values_to_subgraph_inputs, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << 
"gamer" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; SubgraphConcreteFromPattern concrete_from_pattern{ match, full_graph_values_to_subgraph_inputs}; @@ -133,20 +131,13 @@ bool pattern_matches_subgraph_under( if (concrete_values != concrete_values_from_match) { return false; } - std::cout << "later later mid of pattern amtches subgrpah under" << std::endl; for (PatternValue const &pattern_value : get_values(pattern)) { - std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvt " << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; - std::cout << pattern_value << std::endl; if (!additional_criterion.value_criterion( pattern_value, concrete_from_pattern(pattern_value))) { - std::cout << "dfjsahdfkiasjhdfkasjhdfkasdjhdfbgk awerhurgvtfwewefewfewf " - << std::endl; return false; } } - std::cout << "end of pattern amtches subgrpah under" << std::endl; return true; } @@ -156,19 +147,12 @@ bool unlabelled_pattern_does_match( OpenDataflowGraphView const &graph, UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { - std::cout << "unlabelled_pattern_does_match" << std::endl; OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; assert(left_entries(match.node_assignment) == get_nodes(pattern)); assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); - std::cout << "middle of" << std::endl; - std::cout << get_open_dataflow_values(pattern.raw_graph) << std::endl; - std::cout << left_entries(match.node_assignment) << std::endl; - std::cout << right_entries(match.node_assignment) << std::endl; - std::cout << get_nodes(pattern) << std::endl; - std::cout << get_nodes(matched_subgraph) << std::endl; MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ @@ -176,24 +160,17 @@ bool unlabelled_pattern_does_match( 
[&](PatternValue const &pv, OpenDataflowValue const &v) { return v.visit(overload{ [&](DataflowOutput const &) { - // std::cout << "whefihweoifhewfi" < Date: Fri, 4 Apr 2025 14:47:02 -0700 Subject: [PATCH 05/11] Added generic MCMC --- .../compiler/mcmc/generic_mcmc_algorithm.h | 60 +++++++++++++++++++ .../mcmc/generic_mcmc_config.struct.toml | 19 ++++++ .../compiler/mcmc/generic_mcmc_state.h | 27 +++++++++ .../compiler/mcmc/generic_mcmc_algorithm.cc | 1 + .../src/compiler/mcmc/generic_mcmc_state.cc | 12 ++++ .../compiler/mcmc/generic_mcmc_algorithm.cc | 32 ++++++++++ 6 files changed, 151 insertions(+) create mode 100644 lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h create mode 100644 lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml create mode 100644 lib/compiler/include/compiler/mcmc/generic_mcmc_state.h create mode 100644 lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc create mode 100644 lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc create mode 100644 lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h new file mode 100644 index 0000000000..bea0061d47 --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h @@ -0,0 +1,60 @@ +#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H +#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H + +#include "compiler/mcmc/generic_mcmc_config.dtg.h" +#include "compiler/mcmc/generic_mcmc_state.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" +#include + +namespace FlexFlow { + +bool mcmc_accept(float delta, float temperature) { + return delta < 0 || (randf() < exp(-delta / temperature)); +} + +template +void modify_state(Generic_MCMC_state &best_state, + Generic_MCMC_state ¤t_state, + State candidate, + ScoringFunc scorer, + float temperature) { + float best_estimate = 
best_state.get_score(); + float new_estimate = scorer(candidate); + float runtime_delta = new_estimate - best_estimate; + if (mcmc_accept(runtime_delta, temperature)) { + current_state = Generic_MCMC_state(candidate, new_estimate); + if (runtime_delta < 0) { + best_state = current_state; + } + } +} + +// GeneratingFunc : State -> nn_int -> std::optional +// ScoringFunc : State -> float + +template +Generic_MCMC_state + minimize_score(State const &starting_state, + GeneratingFunc const &generator, + ScoringFunc const &scorer, + GenericMCMCConfig const &search_config) { + using MCMCState = Generic_MCMC_state; + MCMCState best_state = MCMCState(starting_state, scorer(starting_state)); + MCMCState current_state = best_state; + for (nonnegative_int i : nonnegative_range(search_config.num_iterations)) { + std::optional candidate = generator(current_state.get_state(), i); + if (candidate != std::nullopt) { + modify_state(best_state, + current_state, + candidate.value(), + scorer, + search_config.temperature); + } + } + return best_state; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml new file mode 100644 index 0000000000..e11c84f0bd --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "GenericMCMCConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "temperature" +type = "float" + +[[fields]] +name = "num_iterations" +type = "::FlexFlow::nonnegative_int" \ No newline at end of file diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h new file mode 100644 index 0000000000..6a6aada32b --- /dev/null +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h @@ -0,0 +1,27 @@ +#ifndef 
_FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H +#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +template +struct Generic_MCMC_state { +public: + Generic_MCMC_state(State const &state, Score const &score) + : state(state), score(score) {} + + State const &get_state() const { + return state; + } + Score const &get_score() const { + return score; + } + +private: + State state; + Score score; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..1bf4f5c2b7 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc new file mode 100644 index 0000000000..6aa4dd5eff --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc @@ -0,0 +1,12 @@ +#include "compiler/mcmc/generic_mcmc_state.h" +#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { +using State = value_type<0>; +using Score = ordered_value_type<1>; + +template struct Generic_MCMC_state; +template struct Generic_MCMC_state; + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..0a175933cf --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1,32 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "doctest/doctest.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("generic_mcmc_algorithm") { + float starting_state = 0.1; + auto generating_func = [](float x, + nonnegative_int i) -> 
std::optional { + float new_x = x + (randf() - 0.5) / (i.unwrap_nonnegative() + 1); + if (new_x < 0) { + return std::nullopt; + } + if (new_x > 1) { + return std::nullopt; + } + return new_x; + }; + auto scoring_func = [](float x) { return (x - 0.5) * (x - 0.5); }; + GenericMCMCConfig config = GenericMCMCConfig{/*temperature*/ 1.0, + /*num_iterations*/ 10_n}; + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + float answer = result.get_state(); + float error = result.get_score(); + CHECK(answer > 0.49); + CHECK(answer < 0.51); + CHECK(error >= 0); + CHECK(error < 0.01); + } +} From 2f186c393cdbdd4e60b7c4d0571da0870c9eff98 Mon Sep 17 00:00:00 2001 From: Victor Li Date: Fri, 4 Apr 2025 18:07:06 -0700 Subject: [PATCH 06/11] Refactor MCMC to fit the generic --- .../compiler/mcmc/generic_mcmc_algorithm.h | 31 +++-- .../compiler/mcmc/mcmc_graph_optimize_state.h | 35 ------ ...cmc_algorithm.h => mcmc_over_mapped_pcg.h} | 8 +- ...> mcmc_over_mapped_pcg_config.struct.toml} | 15 +-- .../src/compiler/mcmc/mcmc_algorithm.cc | 107 ------------------ .../mcmc/mcmc_graph_optimize_state.cc | 84 -------------- .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 73 ++++++++++++ .../compiler/mcmc/generic_mcmc_algorithm.cc | 4 +- .../test/src/compiler/mcmc/mcmc_algorithm.cc | 13 +-- 9 files changed, 105 insertions(+), 265 deletions(-) delete mode 100644 lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h rename lib/compiler/include/compiler/mcmc/{mcmc_algorithm.h => mcmc_over_mapped_pcg.h} (71%) rename lib/compiler/include/compiler/mcmc/{mcmc_search_config.struct.toml => mcmc_over_mapped_pcg_config.struct.toml} (53%) delete mode 100644 lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc delete mode 100644 lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc create mode 100644 lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h 
b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h index bea0061d47..a27ecbc8f4 100644 --- a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h +++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h @@ -9,22 +9,19 @@ namespace FlexFlow { -bool mcmc_accept(float delta, float temperature) { - return delta < 0 || (randf() < exp(-delta / temperature)); -} - template -void modify_state(Generic_MCMC_state &best_state, - Generic_MCMC_state ¤t_state, - State candidate, - ScoringFunc scorer, - float temperature) { +void modify_state_for_minimization( + Generic_MCMC_state &best_state, + Generic_MCMC_state ¤t_state, + State candidate, + ScoringFunc scorer, + float temperature) { float best_estimate = best_state.get_score(); float new_estimate = scorer(candidate); - float runtime_delta = new_estimate - best_estimate; - if (mcmc_accept(runtime_delta, temperature)) { + float delta = new_estimate - best_estimate; + if (delta < 0 || (randf() < exp(-delta / temperature))) { current_state = Generic_MCMC_state(candidate, new_estimate); - if (runtime_delta < 0) { + if (delta < 0) { best_state = current_state; } } @@ -45,11 +42,11 @@ Generic_MCMC_state for (nonnegative_int i : nonnegative_range(search_config.num_iterations)) { std::optional candidate = generator(current_state.get_state(), i); if (candidate != std::nullopt) { - modify_state(best_state, - current_state, - candidate.value(), - scorer, - search_config.temperature); + modify_state_for_minimization(best_state, + current_state, + candidate.value(), + scorer, + search_config.temperature); } } return best_state; diff --git a/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h b/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h deleted file mode 100644 index 3306af123a..0000000000 --- a/lib/compiler/include/compiler/mcmc/mcmc_graph_optimize_state.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H -#define 
_FLEXFLOW_COMPILER_MCMC_ALGORITHM_STATE_H - -#include "compiler/search_result.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" - -namespace FlexFlow { - -struct MCMCOptimizeState { - MCMCOptimizeState() = delete; - explicit MCMCOptimizeState(SearchResult const &mapped_pcg, float runtime); - - SearchResult mapped_pcg; - float runtime; - - bool operator==(MCMCOptimizeState const &other) const; - bool operator!=(MCMCOptimizeState const &other) const; - bool operator<(MCMCOptimizeState const &other) const; -}; - -std::string format_as(MCMCOptimizeState const &); -std::ostream &operator<<(std::ostream &, MCMCOptimizeState const &); - -} // namespace FlexFlow - -namespace std { - -template <> -struct hash<::FlexFlow::MCMCOptimizeState> { - size_t operator()(::FlexFlow::MCMCOptimizeState const &) const; -}; - -} // namespace std - -#endif diff --git a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h similarity index 71% rename from lib/compiler/include/compiler/mcmc/mcmc_algorithm.h rename to lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h index b17eaf3e16..c2d8737184 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_algorithm.h +++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H -#define _FLEXFLOW_COMPILER_MCMC_ALGORITHM_H +#ifndef _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H +#define _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H #include "compiler/cost_estimator/cost_estimator.h" -#include "compiler/mcmc/mcmc_search_config.dtg.h" +#include "compiler/mcmc/mcmc_over_mapped_pcg_config.dtg.h" #include "compiler/search_result.dtg.h" #include "pcg/computation_graph.h" #include "pcg/machine_specification.dtg.h" @@ -15,7 +15,7 @@ namespace FlexFlow { SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, CostEstimator const &cost_estimator, MachineSpecification const &resources, - MCMCSearchConfig const 
&search_config); + MCMCOverMappedPCGConfig const &search_config); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml similarity index 53% rename from lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml rename to lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml index 6bc5d98be7..e1548a581e 100644 --- a/lib/compiler/include/compiler/mcmc/mcmc_search_config.struct.toml +++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "MCMCSearchConfig" +name = "MCMCOverMappedPCGConfig" features = [ "eq", "hash", @@ -7,7 +7,8 @@ features = [ ] includes = [ - "pcg/device_type.dtg.h" + "pcg/device_type.dtg.h", + "utils/nonnegative_int/nonnegative_int.h" ] [[fields]] @@ -16,15 +17,11 @@ type = "float" [[fields]] name = "num_iterations" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] -name = "num_mutations_per_iteration" -type = "int" - -[[fields]] -name = "max_num_ops" -type = "int" +name = "substitution_interval" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_type" diff --git a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc deleted file mode 100644 index f8ef392eee..0000000000 --- a/lib/compiler/src/compiler/mcmc/mcmc_algorithm.cc +++ /dev/null @@ -1,107 +0,0 @@ -#include "compiler/mcmc/mcmc_algorithm.h" -#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" -#include "compiler/machine_mapping/machine_mapping_mutation_set.h" -#include "compiler/mcmc/mcmc_graph_optimize_state.h" -#include "compiler/task_graph_simulator/task_simulator.h" -#include "substitutions/pcg_pattern.h" -#include "substitutions/pcg_pattern_match.h" -#include "substitutions/substitution.h" -#include "substitutions/unity_substitution_set.h" -#include 
"utils/optional.h" -#include "utils/random_utils.h" - -namespace FlexFlow { - -bool mcmc_accept(int delta, float temperature) { - return delta < 0 || randf() < exp(-delta / temperature); -} - -void modify_graph_state(MCMCOptimizeState &best_state, - MCMCOptimizeState ¤t_state, - SearchResult candidate, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - MCMCSearchConfig const &search_config) { - float best_estimate = best_state.runtime; - float new_estimate = task_simulator_estimate_forward_pass_time( - candidate.pcg, cost_estimator, candidate.machine_mapping, resources); - float runtime_delta = new_estimate - best_estimate; - if (mcmc_accept(runtime_delta, search_config.temperature)) { - current_state = MCMCOptimizeState{candidate, new_estimate}; - if (runtime_delta < 0) { - best_state = current_state; - } - } -} - -SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - MCMCSearchConfig const &search_config) { - - std::vector substitutions = get_substitution_set(resources); - - std::optional naive_mapping = - get_naive_mapping(pcg, resources, search_config.device_type); - if (naive_mapping == std::nullopt) { - throw std::runtime_error("Failed to find any solutions"); - } - - MCMCOptimizeState current_state = MCMCOptimizeState{ - SearchResult{pcg, naive_mapping.value()}, - task_simulator_estimate_forward_pass_time( - pcg, cost_estimator, naive_mapping.value(), resources)}; - - MCMCOptimizeState best_state = current_state; - - for (int iteration = 0; iteration < search_config.num_iterations; - ++iteration) { - - SearchResult current_mapped_pcg = current_state.mapped_pcg; - - std::optional new_machine_mapping = get_random_mutation( - current_mapped_pcg, resources, search_config.device_type); - for (int searched_mutations = 1; - searched_mutations < search_config.num_mutations_per_iteration; - searched_mutations++) { - if (new_machine_mapping == 
std::nullopt) { - break; - } - modify_graph_state( - best_state, - current_state, - SearchResult{current_mapped_pcg.pcg, new_machine_mapping.value()}, - cost_estimator, - resources, - search_config); - - new_machine_mapping = get_random_mutation( - current_mapped_pcg, resources, search_config.device_type); - } - - std::optional random_substitution = - get_random_substitution(resources); - if (random_substitution != std::nullopt) { - std::optional pattern_match = get_random_pattern_match( - random_substitution.value().pcg_pattern, - sub_pcg_from_full_pcg(current_mapped_pcg.pcg)); - if (pattern_match != std::nullopt) { - SearchResult new_mapped_pcg = - apply_substitution_and_update_machine_mapping( - current_mapped_pcg, - random_substitution.value(), - pattern_match.value()); - modify_graph_state(best_state, - current_state, - new_mapped_pcg, - cost_estimator, - resources, - search_config); - } - } - } - - return best_state.mapped_pcg; -} - -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc b/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc deleted file mode 100644 index 2556a50b4d..0000000000 --- a/lib/compiler/src/compiler/mcmc/mcmc_graph_optimize_state.cc +++ /dev/null @@ -1,84 +0,0 @@ -#include "compiler/mcmc/mcmc_graph_optimize_state.h" -#include "pcg/machine_view.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" - -namespace FlexFlow { - -MCMCOptimizeState::MCMCOptimizeState(SearchResult const &mapped_pcg, - float runtime) - : mapped_pcg(mapped_pcg), runtime(runtime) {} - -bool MCMCOptimizeState::operator==(MCMCOptimizeState const &other) const { - return pcgs_are_isomorphic(mapped_pcg.pcg, other.mapped_pcg.pcg) && - mapped_pcg.machine_mapping == other.mapped_pcg.machine_mapping && - runtime == other.runtime; -} - -bool MCMCOptimizeState::operator!=(MCMCOptimizeState const &other) const { - return !(*this == other); -} - -bool MCMCOptimizeState::operator<(MCMCOptimizeState const 
&other) const { - return runtime < other.runtime; -} - -std::string format_as(MCMCOptimizeState const &r) { - return fmt::format("", - as_dot(r.mapped_pcg.pcg), - r.mapped_pcg.machine_mapping, - r.runtime); -} - -std::ostream &operator<<(std::ostream &s, MCMCOptimizeState const &st) { - return (s << fmt::to_string(st)); -} -} // namespace FlexFlow - -namespace std { - -size_t hash<::FlexFlow::MCMCOptimizeState>::operator()( - ::FlexFlow::MCMCOptimizeState const &state) const { - ::FlexFlow::ParallelComputationGraph pcg = state.mapped_pcg.pcg; - ::FlexFlow::MachineMapping machine_mapping = state.mapped_pcg.machine_mapping; - size_t seed = 0; - ::FlexFlow::hash_combine(seed, state.runtime); - std::vector<::FlexFlow::parallel_layer_guid_t> layers = - topological_ordering(pcg); - ::FlexFlow::hash_combine(seed, layers.size()); - for (::FlexFlow::parallel_layer_guid_t const &layer : layers) { - ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(pcg, layer)); - std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = - get_incoming_tensors(pcg, layer); - ::FlexFlow::hash_combine(seed, inputs.size()); - for (::FlexFlow::parallel_tensor_guid_t input : inputs) { - for (size_t i = 0; i < layers.size(); ++i) { - if (get_source_layer(input) == layers.at(i)) { - ::FlexFlow::hash_combine(seed, i); - break; - } - } - } - ::FlexFlow::MachineView machine_view = - machine_mapping.machine_views.at(layer); - ::FlexFlow::hash_combine(seed, machine_view.start.node_idx); - ::FlexFlow::hash_combine(seed, machine_view.start.device_idx); - if (get_device_type(machine_view) == ::FlexFlow::DeviceType::CPU) { - ::FlexFlow::hash_combine(seed, 0); - } else { - ::FlexFlow::hash_combine(seed, 1); - } - for (::FlexFlow::MachineViewDimension dimension : machine_view.dimensions) { - ::FlexFlow::hash_combine(seed, dimension.stride.unwrapped); - if (dimension.projection == - ::FlexFlow::MachineSpecificationDimension::INTRA_NODE) { - ::FlexFlow::hash_combine(seed, 0); - } else { - 
::FlexFlow::hash_combine(seed, 1); - } - } - } - - return seed; -} - -} // namespace std diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc new file mode 100644 index 0000000000..47ecc2479f --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -0,0 +1,73 @@ +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/optional.h" + +namespace FlexFlow { + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + MCMCOverMappedPCGConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + + std::optional naive_mapping = + get_naive_mapping(pcg, resources, search_config.device_type); + if (naive_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + SearchResult starting_state = SearchResult{pcg, naive_mapping.value()}; + + auto generating_func = [&](SearchResult mapped_pcg, + nonnegative_int i) -> std::optional { + if (i.unwrap_nonnegative() % + search_config.substitution_interval.unwrap_nonnegative() == + 0) { + // substitutions every (substitution_interval) iterations + std::optional random_substitution = + get_random_substitution(resources); + if (random_substitution != std::nullopt) { + std::optional pattern_match = + get_random_pattern_match(random_substitution.value().pcg_pattern, + sub_pcg_from_full_pcg(mapped_pcg.pcg)); + if (pattern_match != std::nullopt) { + std::cout << "HELLO" << std::endl; + 
return apply_substitution_and_update_machine_mapping( + mapped_pcg, random_substitution.value(), pattern_match.value()); + } + } + return std::nullopt; + } else { + // machine mapping mutations otherwise + std::optional new_machine_mapping = + get_random_mutation(mapped_pcg, resources, search_config.device_type); + if (new_machine_mapping == std::nullopt) { + return std::nullopt; + } + return SearchResult{mapped_pcg.pcg, new_machine_mapping.value()}; + } + }; + + auto scoring_func = [&](SearchResult mapped_pcg) -> float { + return task_simulator_estimate_forward_pass_time( + mapped_pcg.pcg, cost_estimator, mapped_pcg.machine_mapping, resources); + }; + + GenericMCMCConfig config = + GenericMCMCConfig{/*temperature*/ search_config.temperature, + /*num_iterations*/ search_config.num_iterations}; + + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + + return result.get_state(); +} + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc index 0a175933cf..ba6faa93c4 100644 --- a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc +++ b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -18,8 +18,8 @@ TEST_SUITE(FF_TEST_SUITE) { return new_x; }; auto scoring_func = [](float x) { return (x - 0.5) * (x - 0.5); }; - GenericMCMCConfig config = GenericMCMCConfig{/*temperature*/ 1.0, - /*num_iterations*/ 10_n}; + GenericMCMCConfig config = GenericMCMCConfig{/*temperature=*/1.0, + /*num_iterations=*/10_n}; Generic_MCMC_state result = minimize_score(starting_state, generating_func, scoring_func, config); float answer = result.get_state(); diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc index 7aad8b098d..5c469c4301 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc +++ 
b/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc @@ -1,5 +1,5 @@ -#include "compiler/mcmc/mcmc_algorithm.h" #include "../cost_estimator_for_test.h" +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" #include "compiler/task_graph_simulator/task_simulator.h" #include "doctest/doctest.h" #include "op-attrs/parallel_tensor_dims.h" @@ -62,12 +62,11 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - MCMCSearchConfig search_config = - MCMCSearchConfig{/*temperature=*/1.0, - /*num_iterations=*/5, - /*num_mutations_per_iteration=*/10, - /*max_num_ops=*/100, - /*device_type=*/DeviceType::GPU}; + MCMCOverMappedPCGConfig search_config = + MCMCOverMappedPCGConfig{/*temperature=*/1.0, + /*num_iterations=*/100_n, + /*substitution_interval=*/100_n, + /*device_type=*/DeviceType::GPU}; SearchResult result = mcmc_graph_optimize( pcg, cost_estimator, full_machine_spec, search_config); From d0480f4151f7da33da3cd6af261391e309721fe5 Mon Sep 17 00:00:00 2001 From: Dylan Lim <72822184+dylanllim@users.noreply.github.com> Date: Fri, 2 May 2025 03:47:30 -0700 Subject: [PATCH 07/11] CPU Kernel Tests (#1439) * test_utils refactor, local_cpu_allocator * test utils modification, cast, reverse, and replicate cpu kernels * combine kernel * combine kernels .h file * Implementations for methods for machine_views and associated modules (#1429) * initial commit for machine view adjacent modules * Formatting * Tests for new machine_view.cc functions * formatting * Minor Test correction * formatting * PR fixes * PR Fixes --------- Co-authored-by: Pietro Max Marsella * test utils logic cleanup, reverse cpu_kernel pedagogical implmentation, other minor fixes * cpu_kernel's refactor, generic tensor accessor indexing * accessor.h formatting * mk_runtime_error formatting * reverse_kernels include * test_utils refactor and clarity * formatting * comment removal reverse_kernels * Issue #1435, tests for managed stream and handle * #1435 formatting * #1409 issue, change datatype for linear 
kernels away from void * * R & W accessor changes, minimize code bloat * code formatting and refactor * issue #1502 & issue #1540 * format check * branch merge and test fixes * build issues * Add AWS linux AMI to runs-on for testing (#1589) * Pin runs-on images (#1590) * GPU CI Fix (Pin runs-on GPU image) (#1588) * Debug * Change to base DL AMI * Print disk usage * Run nvidia-smi * Remove excess cuda installs in base ami * Re-enable freeing space in GPU CI * Try updating nix-develop version * Check what happens if you just enter the non-nixGL environment * Try switching AMIs * Try to remove the module stuff * Move to lockshaw/develop-action * Try pointing at a fixed commit * Update nix-develop action * Update nix-develop action to use BASH_FUNC filtering * Remove all the /usr/local/cuda entries * Switch back to gpu-ci env * Update the cuda arch * Try out the new runs-on gpu image * Move over to pinned runs-on image * Remove a bunch more unnecessary stuff in image to get back disk space * Try using an emphemeral store * Try mounting * Fix bug * Try sudo * Move nix into _work * Rollback all unnecessary changes * Re-enable waiting on cpu-ci * Merge substitution-builder (#1575) * Start on pcg builder * Add tests and some implementation for pcg builder * Add pcg tests, make dtgen constructors explicit to fix bug * Add remainder of PCG tests * Fix build issues in local-execution * Format * Address Reyna comments, add topological_order function for PCG * Pre multidigraph refactor * Removing visitable from sp code * Add open dataflow graph, start to replace pcg dataflow graph * Start refactoring substitutions * Add utility functions to support pattern matching * Pre-refactor inputs * Fix proj url * Get back to substitutions, now with unordered graph inputs * Get substitutions building * substitutions-tests now builds * Fix bug in filter, pass some initial substitution tests * Add tests for fmt::to_string, fix some substitutions bugs * Pass initial unit tests for 
find_pattern_matches * Start on unit tests for pcg pattern * Pass initial test for find_pattern_matches * Fix small build issue in tests * Format * Sync tests in CI with tests in proj * Fix minor build errors in kernels and local-execution * Format * Remove outdated code * More outdated code removal * More cleanup, add test for sp decomposition * Pull apart containers.h * More sp testing and fixes * Break up graph algorithms.h * Pre- full SP algo commit * Add initial implementation and tests for cbc decomposition and inverse line graph * Pass test for get_inverse_line_graph * Add new multidigraph * Fix get_inverse_line_graph to return a MultiDiGraph instead of a DiGraph * Add tests for parallel and series reduction finding * Add really rough implementation of valdez sp decomposition * Fix local-execution build * Add implementations and tests for applying series/parallel reductions * Format * Clean up sp decomposition interface and tests * Format * Add comments for top-level substitutions functions, add proj doxygen support * Start sketching out substitutions code * Fix build errors * Add ability to permute node ids * Cleanup and start to test new substitutions code * Add test case for evaluate_substitution_output * Add naive isomorphism detection code * Add graph inputs to open dataflow graph isomorphism * Add input permutation to evaluate_substitution_output * Fix permute_node_ids * Add test for permute_input_ids * Migrate over to mutable implementation of apply_substitution * Add fast isomorphism checking and an initial implementation of full substitution logic * Pass initial full substitutions test * Cleanup old isomorphism checking code * Fix post-merge bugs * Fix broken pcg builder test * Format * Reorganize code and remove some outdated code pre-code-review * Format * Restarting work on this after working on export-model-arch * Adding in some a simple function to get the currently available substritutions * nonnegative_int additions, code cleanup, etc. 
* A bunch more moving over to nonnegative_int * Even more nonnegative_int updating * Fix build * Fix failing tests * Format * Format --------- Co-authored-by: Colin Unger Co-authored-by: Victor Li * test_utils refactor, local_cpu_allocator * test utils modification, cast, reverse, and replicate cpu kernels * combine kernel * test utils logic cleanup, reverse cpu_kernel pedagogical implmentation, other minor fixes * cpu_kernel's refactor, generic tensor accessor indexing * test_utils refactor and clarity * R & W accessor changes, minimize code bloat * issue #1502 & issue #1540 * branch merge and test fixes * merge * build after merge * kernel issues * managed stream / handle test case fix * test_utils update, kernel/ops refactor * Review fixes * Update doctest includes in kernels * More PR review * Try using rhel package-based nixgl * Format * Update proj with test command fixes * Attempt to fix gpu CI * Use custom AMI in GPU CI * Fix proj bug in cpu-ci * Try including run id * Temporarily allow gpu ci to run regardless for testing purposes * Try using official ubuntu ami in gpu ci * Try out new ami * Change to use new flexflow-gpu-ci AMI * Fix bugs in GPU tests and restore GPU CI gating * Format * Fix bug in accessor formatting test cases * Bugfixes and updated proj * Fix all cpu tests * Format * Add improved test failure output for replicate cpu vs gpu tests * Continue debugging replicate cuda testcases * Format * Fix incorrect tensor size in replicate kernel tests * Transpose replicate backward cpu kernel * Try flipping output dimensions in replica cuda kernel test * Update proj --------- Co-authored-by: Marsella8 <45826022+Marsella8@users.noreply.github.com> Co-authored-by: Pietro Max Marsella Co-authored-by: Colin Unger Co-authored-by: Victor Li <32348970+victorli2002@users.noreply.github.com> Co-authored-by: Victor Li --- .flake/pkgs/fccf/default.nix | 54 ++ .flake/pkgs/fccf/fix-argparse-include.patch | 13 + .flake/pkgs/fccf/json-package-name.patch | 12 + 
.github/runs-on.yml | 19 - .github/workflows/tests.yml | 4 +- .proj.toml | 60 +- cmake/flexflow-utils.cmake | 7 +- flake.lock | 6 +- flake.nix | 3 + lib/kernels/CMakeLists.txt | 4 +- lib/kernels/include/kernels/accessor.h | 223 ++++--- lib/kernels/include/kernels/allocation.h | 9 +- .../include/kernels/array_coord.struct.toml | 19 + lib/kernels/include/kernels/array_shape.h | 31 +- .../include/kernels/attention_kernels.h | 7 +- .../include/kernels/batch_matmul_kernels.h | 10 +- .../include/kernels/batch_norm_kernels.h | 15 +- lib/kernels/include/kernels/cast_kernels.h | 22 +- .../include/kernels/cast_kernels_cpu.h | 17 + lib/kernels/include/kernels/combine_kernels.h | 10 +- .../include/kernels/combine_kernels_cpu.h | 17 + lib/kernels/include/kernels/concat_kernels.h | 10 +- lib/kernels/include/kernels/conv_2d_kernels.h | 12 +- .../include/kernels/copy_tensor_accessor.h | 27 + .../include/kernels/datatype_dispatch.h | 13 +- lib/kernels/include/kernels/dropout_kernels.h | 8 +- .../include/kernels/element_binary_kernels.h | 8 +- .../include/kernels/element_unary_kernels.h | 14 +- .../include/kernels/embedding_kernels.h | 14 +- lib/kernels/include/kernels/ff_handle.h | 2 +- lib/kernels/include/kernels/flat_kernels.h | 15 +- .../kernels/format_accessor_contents.h | 13 + lib/kernels/include/kernels/gather_kernels.h | 10 +- .../include/kernels/layer_norm_kernels.h | 8 +- lib/kernels/include/kernels/legion_dim.h | 24 +- .../kernels/legion_ordered/legion_ordered.h | 197 ++++++ .../include/kernels/legion_ordered/slice.h | 24 + .../kernels/legion_ordered/transform.h | 17 + lib/kernels/include/kernels/linear_kernels.h | 30 +- .../include/kernels}/local_cpu_allocator.h | 7 + .../include/kernels/local_cuda_allocator.h | 2 + .../include/kernels/managed_ff_stream.h | 5 +- .../kernels/managed_per_device_ff_handle.h | 8 +- lib/kernels/include/kernels/metrics_kernels.h | 29 +- lib/kernels/include/kernels/nccl.h | 8 +- .../include/kernels/optimizer_kernels.h | 19 +- 
.../include/kernels/partition_kernels.h | 12 +- .../kernels}/per_device_op_state.variant.toml | 0 lib/kernels/include/kernels/pool_2d_kernels.h | 14 +- lib/kernels/include/kernels/profiling.h | 2 +- lib/kernels/include/kernels/reduce_kernels.h | 12 +- .../include/kernels/reduction_kernels.h | 14 +- .../include/kernels/replicate_kernels.h | 12 +- .../include/kernels/replicate_kernels_cpu.h | 18 + lib/kernels/include/kernels/reshape_kernels.h | 12 +- lib/kernels/include/kernels/reverse_kernels.h | 29 +- .../include/kernels/reverse_kernels_cpu.h | 20 + .../include/kernels/reverse_kernels_params.h | 16 + .../reverse_kernels_params.struct.toml | 28 + lib/kernels/include/kernels/softmax_kernels.h | 10 +- lib/kernels/include/kernels/split_kernels.h | 11 +- lib/kernels/include/kernels/topk_kernels.h | 8 +- .../include/kernels/transpose_kernels.h | 12 +- lib/kernels/src/accessor.cc | 192 ------ lib/kernels/src/allocation.cc | 21 - lib/kernels/src/cpu/ops/cast_kernels.cc | 51 ++ lib/kernels/src/cpu/ops/combine_kernels.cc | 39 ++ .../src/cpu/{ => ops}/initializer_kernels.cc | 0 lib/kernels/src/cpu/ops/replicate_kernels.cc | 51 ++ lib/kernels/src/cpu/ops/reverse_kernels.cc | 46 ++ lib/kernels/src/cuda/cuda_helper.cu | 14 +- lib/kernels/src/cuda/embedding_kernels.cu | 567 ++++++++++++++---- lib/kernels/src/cuda/loss_function_kernels.cu | 2 +- lib/kernels/src/cuda/metrics_functions.cu | 96 +-- lib/kernels/src/cuda/ops/attention_kernels.cu | 2 +- .../src/cuda/ops/batch_matmul_kernels.cu | 2 +- .../src/cuda/ops/batch_norm_kernels.cu | 6 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 24 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 2 +- lib/kernels/src/cuda/ops/concat_kernels.cu | 92 +-- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 6 +- lib/kernels/src/cuda/ops/dropout_kernels.cu | 2 +- .../src/cuda/ops/element_binary_kernels.cu | 2 +- .../src/cuda/ops/element_unary_kernels.cu | 20 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 6 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 26 
+- lib/kernels/src/cuda/ops/linear_kernels.cu | 78 +-- lib/kernels/src/cuda/ops/partition_kernels.cu | 12 +- lib/kernels/src/cuda/ops/pool_2d_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reduce_kernels.cu | 2 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 12 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 15 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 12 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 78 ++- lib/kernels/src/cuda/ops/softmax_kernels.cu | 4 +- lib/kernels/src/cuda/ops/split_kernels.cu | 2 +- lib/kernels/src/cuda/ops/topk_kernels.cu | 2 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 8 +- lib/kernels/src/cuda/optimizer_kernel.cu | 216 ------- lib/kernels/src/cuda/optimizer_kernels.cu | 205 +++++++ lib/kernels/src/hip/embedding_kernels.cpp | 32 +- lib/kernels/src/hip/loss_function_kernels.cpp | 2 +- lib/kernels/src/hip/ops/attention_kernels.cpp | 2 +- .../src/hip/ops/batch_matmul_kernels.cpp | 2 +- .../src/hip/ops/batch_norm_kernels.cpp | 2 +- lib/kernels/src/hip/ops/cast_kernels.cpp | 2 +- lib/kernels/src/hip/ops/combine_kernels.cpp | 2 +- lib/kernels/src/hip/ops/concat_kernels.cpp | 2 +- lib/kernels/src/hip/ops/conv_2d_kernels.h | 2 +- lib/kernels/src/hip/ops/dropout_kernels.cpp | 2 +- .../src/hip/ops/element_binary_kernels.cpp | 2 +- .../src/hip/ops/element_unary_kernels.cpp | 2 +- lib/kernels/src/hip/ops/flat_kernels.cpp | 2 +- lib/kernels/src/hip/ops/gather_kernels.cpp | 2 +- lib/kernels/src/hip/ops/partition_kernels.cpp | 2 +- lib/kernels/src/hip/ops/pool_2d_kernels.cpp | 2 +- lib/kernels/src/hip/ops/reduce_kernels.cpp | 2 +- lib/kernels/src/hip/ops/replicate_kernels.cpp | 2 +- lib/kernels/src/hip/ops/reshape_kernels.cpp | 2 +- lib/kernels/src/hip/ops/reverse_kernels.cpp | 2 +- lib/kernels/src/hip/ops/softmax_kernels.cpp | 2 +- lib/kernels/src/hip/ops/split_kernels.cpp | 2 +- lib/kernels/src/hip/ops/topk_kernels.cpp | 2 +- lib/kernels/src/hip/ops/transpose_kernels.cpp | 2 +- lib/kernels/src/{ => internal}/device.cc | 2 +- 
lib/kernels/src/{ => internal}/device.h | 4 +- lib/kernels/src/kernels/accessor.cc | 249 ++++++++ lib/kernels/src/kernels/allocation.cc | 38 ++ lib/kernels/src/{ => kernels}/array_shape.cc | 87 ++- .../src/kernels/copy_tensor_accessor.cc | 66 ++ .../src/kernels/format_accessor_contents.cc | 184 ++++++ lib/kernels/src/{ => kernels}/legion_dim.cc | 5 + .../kernels/legion_ordered/legion_ordered.cc | 10 + .../src/kernels/legion_ordered/slice.cc | 12 + .../src/kernels/legion_ordered/transform.cc | 12 + .../src/kernels}/local_cpu_allocator.cc | 21 +- .../src/{ => kernels}/local_cuda_allocator.cc | 20 +- .../src/kernels/reverse_kernels_params.cc | 30 + lib/kernels/src/managed_ff_stream.cc | 20 +- .../src/managed_per_device_ff_handle.cc | 38 +- lib/kernels/test/CMakeLists.txt | 1 + .../test/src/cpu/ops/replicate_kernels.cc | 57 ++ .../test/src/cpu/ops/reverse_kernels.cc | 206 +++++++ lib/kernels/test/src/internal/test_utils.cc | 392 ++++++++++++ lib/kernels/test/src/internal/test_utils.h | 78 +++ lib/kernels/test/src/kernels/accessor.cc | 73 +++ lib/kernels/test/src/kernels/array_shape.cc | 49 ++ .../src/kernels/format_accessor_contents.cc | 94 +++ lib/kernels/test/src/kernels/legion_dim.cc | 32 + .../kernels/legion_ordered/legion_ordered.cc | 12 + .../test/src/kernels/legion_ordered/slice.cc | 30 + .../src/kernels/legion_ordered/transform.cc | 36 ++ lib/kernels/test/src/test_attention_kernel.cc | 44 +- .../test/src/test_batch_matmul_kernel.cc | 28 +- .../test/src/test_batch_norm_kernel.cc | 68 +-- lib/kernels/test/src/test_cast_kernel.cc | 102 ++-- lib/kernels/test/src/test_combine_kernel.cc | 93 ++- lib/kernels/test/src/test_concat_kernel.cc | 139 +++-- lib/kernels/test/src/test_cuda.cc | 6 +- lib/kernels/test/src/test_dropout.cc | 31 +- lib/kernels/test/src/test_flat_kernel.cc | 48 +- lib/kernels/test/src/test_gather_kernels.cc | 134 +++-- .../test/src/test_layer_norm_kernels.cc | 37 +- .../test/src/test_managed_ff_stream.cc | 107 ++++ 
.../src/test_managed_per_device_ff_handle.cc | 37 ++ lib/kernels/test/src/test_partition_kernel.cc | 51 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 42 +- lib/kernels/test/src/test_reduction_kernel.cc | 48 +- lib/kernels/test/src/test_replicate_kernel.cc | 157 ++++- lib/kernels/test/src/test_reshape_kernel.cc | 43 +- lib/kernels/test/src/test_reverse_kernels.cc | 137 +++-- lib/kernels/test/src/test_softmax_kernel.cc | 33 +- lib/kernels/test/src/test_split_kernel.cc | 25 +- lib/kernels/test/src/test_transpose_kernel.cc | 40 +- lib/kernels/test/src/test_utils.cc | 106 ---- lib/kernels/test/src/test_utils.h | 72 --- .../local-execution/per_device_op_state.h | 2 +- .../local-execution/task_argument_accessor.h | 2 +- .../local-execution/tracked_allocator.h | 3 + .../src/local_task_argument_accessor.cc | 7 +- lib/local-execution/src/ops/batch_norm.cc | 4 +- lib/local-execution/src/ops/cast.cc | 8 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 16 +- lib/local-execution/src/ops/flat.cc | 6 +- lib/local-execution/src/ops/linear.cc | 39 +- lib/local-execution/src/ops/pool_2d.cc | 10 +- lib/local-execution/src/ops/reduction.cc | 6 +- lib/local-execution/src/ops/repartition.cc | 4 +- lib/local-execution/src/ops/replicate.cc | 2 +- lib/local-execution/src/ops/reshape.cc | 4 +- lib/local-execution/src/ops/reverse.cc | 48 +- lib/local-execution/src/ops/softmax.cc | 2 +- lib/local-execution/src/ops/transpose.cc | 4 +- ...device_state.cc => per_device_op_state.cc} | 0 lib/local-execution/src/tracked_allocator.cc | 7 +- .../test/src/test_local_cost_estimator.cc | 113 ++-- .../test/src/test_local_slots_backing.cc | 22 +- .../test/src/test_local_task_arg_accessor.cc | 2 +- .../include/op-attrs/aggregate_op.enum.toml | 3 +- .../include/op-attrs/datatype_value.h | 16 + .../op-attrs/dim_ordered/dim_ordered.h | 199 +----- .../include/op-attrs/dim_ordered/slice.h | 45 +- .../{dim_ordered => ff_ordered}/concat.h | 2 +- .../{dim_ordered 
=> ff_ordered}/enumerate.h | 2 +- .../include/op-attrs/ff_ordered/ff_ordered.h | 228 +++++++ .../ff_ordered_from_map.h | 4 +- .../ff_ordered_of.h | 2 +- .../{dim_ordered => ff_ordered}/get_idxs.h | 2 +- .../include/op-attrs/ff_ordered/slice.h | 49 ++ .../include/op-attrs/ff_ordered/transform.h | 17 + .../include/op-attrs/ff_ordered/zip.h | 18 + .../op-attrs/ops/transpose_attrs.struct.toml | 2 +- .../parallel_tensor_dim_degrees.struct.toml | 2 +- .../op-attrs/parallel_tensor_dims.struct.toml | 2 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 2 +- .../include/op-attrs/tensor_dims.struct.toml | 2 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 2 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 25 + .../src/op-attrs/dim_ordered/concat.cc | 1 - .../src/op-attrs/dim_ordered/enumerate.cc | 1 - .../dim_ordered/ff_ordered_from_map.cc | 1 - .../src/op-attrs/dim_ordered/ff_ordered_of.cc | 1 - .../src/op-attrs/dim_ordered/get_idxs.cc | 1 - .../src/op-attrs/dim_ordered/slice.cc | 25 - .../src/op-attrs/dim_ordered/transform.cc | 1 + .../src/op-attrs/ff_ordered/enumerate.cc | 10 + .../src/op-attrs/ff_ordered/ff_ordered.cc | 14 + .../ff_ordered/ff_ordered_from_map.cc | 13 + .../src/op-attrs/ff_ordered/get_idxs.cc | 10 + lib/op-attrs/src/op-attrs/ff_ordered/slice.cc | 24 + .../src/op-attrs/ff_ordered/transform.cc | 12 + lib/op-attrs/src/op-attrs/ff_ordered/zip.cc | 12 + lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 4 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 6 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 4 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 13 +- .../src/op-attrs/parallel_tensor_dims.cc | 4 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 6 +- lib/op-attrs/src/op-attrs/tensor_shape.cc | 2 +- .../test/src/op-attrs/datatype_value.cc | 68 +++ .../src/op-attrs/dim_ordered/dim_ordered.cc | 4 - .../{dim_ordered => ff_ordered}/concat.cc | 2 +- .../{dim_ordered => 
ff_ordered}/enumerate.cc | 2 +- .../src/op-attrs/ff_ordered/ff_ordered.cc | 11 + .../ff_ordered_from_map.cc | 2 +- .../{dim_ordered => ff_ordered}/slice.cc | 19 +- .../test/src/op-attrs/ff_ordered/transform.cc | 35 ++ .../test/src/op-attrs/ff_ordered/zip.cc | 38 ++ lib/pcg/include/pcg/metric.enum.toml | 26 + lib/pcg/include/pcg/metric_attrs.h | 28 + lib/pcg/src/pcg/metric_attrs.cc | 38 ++ .../generate_weight_transform.cc | 2 +- lib/runtime/src/metrics_functions.cc | 33 - lib/runtime/src/metrics_functions.h | 63 +- lib/runtime/src/ops/embedding.cc | 4 +- .../utils/containers/{subvec.h => slice.h} | 16 +- .../include/utils/containers/zip_strict.h | 14 +- lib/utils/include/utils/exception.h | 1 + lib/utils/include/utils/indent.h | 12 + .../include/utils/stack_vector/stack_vector.h | 29 +- lib/utils/src/utils/containers/slice.cc | 3 + lib/utils/src/utils/containers/subvec.cc | 1 - .../full_binary_tree/binary_tree_path.cc | 4 +- .../graph/series_parallel/series_reduction.cc | 4 +- lib/utils/src/utils/indent.cc | 17 + .../src/utils/stack_vector/stack_vector.cc | 4 +- .../include/test/utils/doctest/check_kv.h | 12 + lib/utils/test/common/src/main.cc | 17 +- .../common/src/test/utils/doctest/check_kv.cc | 17 + .../utils/containers/{subvec.cc => slice.cc} | 24 +- lib/utils/test/src/utils/indent.cc | 66 ++ .../src/utils/stack_vector/stack_vector.cc | 85 +++ 275 files changed, 6048 insertions(+), 2570 deletions(-) create mode 100644 .flake/pkgs/fccf/default.nix create mode 100644 .flake/pkgs/fccf/fix-argparse-include.patch create mode 100644 .flake/pkgs/fccf/json-package-name.patch create mode 100644 lib/kernels/include/kernels/array_coord.struct.toml create mode 100644 lib/kernels/include/kernels/cast_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/combine_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/copy_tensor_accessor.h create mode 100644 lib/kernels/include/kernels/format_accessor_contents.h create mode 100644 
lib/kernels/include/kernels/legion_ordered/legion_ordered.h create mode 100644 lib/kernels/include/kernels/legion_ordered/slice.h create mode 100644 lib/kernels/include/kernels/legion_ordered/transform.h rename lib/{local-execution/include/local-execution => kernels/include/kernels}/local_cpu_allocator.h (74%) rename lib/{local-execution/include/local-execution => kernels/include/kernels}/per_device_op_state.variant.toml (100%) create mode 100644 lib/kernels/include/kernels/replicate_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_params.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_params.struct.toml delete mode 100644 lib/kernels/src/accessor.cc delete mode 100644 lib/kernels/src/allocation.cc create mode 100644 lib/kernels/src/cpu/ops/cast_kernels.cc create mode 100644 lib/kernels/src/cpu/ops/combine_kernels.cc rename lib/kernels/src/cpu/{ => ops}/initializer_kernels.cc (100%) create mode 100644 lib/kernels/src/cpu/ops/replicate_kernels.cc create mode 100644 lib/kernels/src/cpu/ops/reverse_kernels.cc delete mode 100644 lib/kernels/src/cuda/optimizer_kernel.cu create mode 100644 lib/kernels/src/cuda/optimizer_kernels.cu rename lib/kernels/src/{ => internal}/device.cc (97%) rename lib/kernels/src/{ => internal}/device.h (98%) create mode 100644 lib/kernels/src/kernels/accessor.cc create mode 100644 lib/kernels/src/kernels/allocation.cc rename lib/kernels/src/{ => kernels}/array_shape.cc (51%) create mode 100644 lib/kernels/src/kernels/copy_tensor_accessor.cc create mode 100644 lib/kernels/src/kernels/format_accessor_contents.cc rename lib/kernels/src/{ => kernels}/legion_dim.cc (78%) create mode 100644 lib/kernels/src/kernels/legion_ordered/legion_ordered.cc create mode 100644 lib/kernels/src/kernels/legion_ordered/slice.cc create mode 100644 lib/kernels/src/kernels/legion_ordered/transform.cc rename lib/{local-execution/src => 
kernels/src/kernels}/local_cpu_allocator.cc (52%) rename lib/kernels/src/{ => kernels}/local_cuda_allocator.cc (59%) create mode 100644 lib/kernels/src/kernels/reverse_kernels_params.cc create mode 100644 lib/kernels/test/src/cpu/ops/replicate_kernels.cc create mode 100644 lib/kernels/test/src/cpu/ops/reverse_kernels.cc create mode 100644 lib/kernels/test/src/internal/test_utils.cc create mode 100644 lib/kernels/test/src/internal/test_utils.h create mode 100644 lib/kernels/test/src/kernels/accessor.cc create mode 100644 lib/kernels/test/src/kernels/array_shape.cc create mode 100644 lib/kernels/test/src/kernels/format_accessor_contents.cc create mode 100644 lib/kernels/test/src/kernels/legion_dim.cc create mode 100644 lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc create mode 100644 lib/kernels/test/src/kernels/legion_ordered/slice.cc create mode 100644 lib/kernels/test/src/kernels/legion_ordered/transform.cc create mode 100644 lib/kernels/test/src/test_managed_ff_stream.cc create mode 100644 lib/kernels/test/src/test_managed_per_device_ff_handle.cc delete mode 100644 lib/kernels/test/src/test_utils.cc delete mode 100644 lib/kernels/test/src/test_utils.h rename lib/local-execution/src/{per_device_state.cc => per_device_op_state.cc} (100%) create mode 100644 lib/op-attrs/include/op-attrs/datatype_value.h rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/concat.h (95%) rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/enumerate.h (95%) create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/ff_ordered_from_map.h (88%) rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/ff_ordered_of.h (88%) rename lib/op-attrs/include/op-attrs/{dim_ordered => ff_ordered}/get_idxs.h (91%) create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/slice.h create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/transform.h create mode 100644 
lib/op-attrs/include/op-attrs/ff_ordered/zip.h create mode 100644 lib/op-attrs/src/op-attrs/datatype_value.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/concat.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc delete mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc create mode 100644 lib/op-attrs/src/op-attrs/dim_ordered/transform.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/slice.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/transform.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/zip.cc create mode 100644 lib/op-attrs/test/src/op-attrs/datatype_value.cc rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/concat.cc (97%) rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/enumerate.cc (92%) create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/ff_ordered_from_map.cc (96%) rename lib/op-attrs/test/src/op-attrs/{dim_ordered => ff_ordered}/slice.cc (79%) create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc create mode 100644 lib/pcg/include/pcg/metric.enum.toml create mode 100644 lib/pcg/include/pcg/metric_attrs.h create mode 100644 lib/pcg/src/pcg/metric_attrs.cc rename lib/utils/include/utils/containers/{subvec.h => slice.h} (69%) create mode 100644 lib/utils/include/utils/indent.h create mode 100644 lib/utils/src/utils/containers/slice.cc delete 
mode 100644 lib/utils/src/utils/containers/subvec.cc create mode 100644 lib/utils/src/utils/indent.cc create mode 100644 lib/utils/test/common/include/test/utils/doctest/check_kv.h create mode 100644 lib/utils/test/common/src/test/utils/doctest/check_kv.cc rename lib/utils/test/src/utils/containers/{subvec.cc => slice.cc} (69%) create mode 100644 lib/utils/test/src/utils/indent.cc diff --git a/.flake/pkgs/fccf/default.nix b/.flake/pkgs/fccf/default.nix new file mode 100644 index 0000000000..f792b8606c --- /dev/null +++ b/.flake/pkgs/fccf/default.nix @@ -0,0 +1,54 @@ +{ fetchFromGitHub +, stdenv +, cmake +, pkg-config +, libclang +, libllvm +, lib +, zlib +, argparse +, nlohmann_json +, fmt +}: + +stdenv.mkDerivation rec { + pname = "fccf"; + version = "03d373fc65e2d7ceeac441ba4bbddfdc25618dff"; + + src = fetchFromGitHub { + owner = "p-ranav"; + repo = "fccf"; + rev = version; + sha256 = "sha256-3NdPon5ZfjoGFFgBlb0rzRnfWgSopvAc5Gls2NWHaOE="; + }; + + nativeBuildInputs = [ + cmake + pkg-config + ]; + + buildInputs = [ + libclang + libllvm + zlib + argparse + nlohmann_json + fmt + ]; + + patches = [ + ./json-package-name.patch + ./fix-argparse-include.patch + ]; + + cmakeFlags = [ + "-DCMAKE_BUILD_TYPE=Release" + "-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS" + ]; + + meta = with lib; { + description = "A command-line tool that quickly searches through C/C++ source code in a directory based on a search string and prints relevant code snippets that match the query"; + homepage = "https://github.com/p-ranav/fccf"; + license = licenses.mit; + }; +} diff --git a/.flake/pkgs/fccf/fix-argparse-include.patch b/.flake/pkgs/fccf/fix-argparse-include.patch new file mode 100644 index 0000000000..2cb648c1bf --- /dev/null +++ b/.flake/pkgs/fccf/fix-argparse-include.patch @@ -0,0 +1,13 @@ +diff --git a/source/main.cpp b/source/main.cpp +index 7e131d3..6c05d89 100644 +--- a/source/main.cpp ++++ b/source/main.cpp +@@ -6,7 +6,7 @@ + #include + #include + +-#include ++#include + 
#include + #include "searcher.hpp" + #include diff --git a/.flake/pkgs/fccf/json-package-name.patch b/.flake/pkgs/fccf/json-package-name.patch new file mode 100644 index 0000000000..51f6a012cf --- /dev/null +++ b/.flake/pkgs/fccf/json-package-name.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 20bcbbf..923075f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -48,6 +48,7 @@ FetchContent_MakeAvailable(fmt) + + FetchContent_Declare(json + URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz ++ FIND_PACKAGE_ARGS NAMES nlohmann_json + ) + FetchContent_MakeAvailable(json) + diff --git a/.github/runs-on.yml b/.github/runs-on.yml index a4fff33536..5033e69d65 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,23 +1,4 @@ images: - runs-on-gpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - # to find, go to - # https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Images:visibility=public-images;search=:runs-on;v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false - name: "runs-on-v2.2-ubuntu22-gpu-x64-20250220122045" - - runs-on-cpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" - - official-ubuntu-ami: - platform: "linux" - arch: "x64" - ami: "ami-0a60b027285c0d4c5" - flexflow-gpu-ci: platform: "linux" arch: "x64" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d98fb07dd..799e3069a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,9 +57,9 @@ jobs: name: GPU unit tests needs: cpu-ci runs-on: - - runs-on + - runs-on=${{ github.run_id }} - family=g4dn.xlarge - - image=runs-on-gpu-pinned + - image=flexflow-gpu-ci strategy: max-parallel: 1 diff --git a/.proj.toml b/.proj.toml index a06fb53c3a..8eed6166cd 100644 --- a/.proj.toml +++ b/.proj.toml @@ -2,57 +2,81 @@ project_name = "flexflow" testsuite_macro = 
"FF_TEST_SUITE" namespace_name = "FlexFlow" header_extension = ".h" +cuda_launch_cmd = [ + "nixGL", + "--", +] [targets.utils] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.op-attrs] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.kernels] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false [targets.pcg] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitutions] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.compiler] type = "lib" -tests = true -benchmarks = true +has-cpu-only-tests = true +has-cpu-only-benchmarks = true +has-cuda-tests = false +has-cuda-benchmarks = false [targets.substitution-generator] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.local-execution] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.models] type = "lib" -tests = true -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = false +has-cuda-benchmarks = false [targets.export-model-arch] type = "bin" +cuda = false [targets.substitution-to-dot] type = "bin" +cuda = false # default_build_targets = [ # "utils", diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 478ebda318..ef5d6d9d11 100644 --- 
a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -126,11 +126,16 @@ function(ff_add_test_executable) ${FF_TEST_EXEC_NAME} ${SRC}) + target_include_directories( + ${FF_TEST_EXEC_NAME} + PRIVATE + ${FF_TEST_EXEC_PRIVATE_INCLUDE}) + target_link_libraries( ${FF_TEST_EXEC_NAME} ${FF_TEST_EXEC_DEPS}) - target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") + target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="cpu-${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") define_ff_vars(${FF_TEST_EXEC_NAME}) ff_set_cxx_properties(${FF_TEST_EXEC_NAME}) diff --git a/flake.lock b/flake.lock index c991232013..ff6e797d51 100644 --- a/flake.lock +++ b/flake.lock @@ -66,11 +66,11 @@ ] }, "locked": { - "lastModified": 1741679698, - "narHash": "sha256-poSOQS/2qImAo/PgRu37pHdOrwAsZEyC8PMM3evFLX4=", + "lastModified": 1746157536, + "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", "owner": "lockshaw", "repo": "proj", - "rev": "0de983ff66abea4703f73988d29fc807e2b0a9bd", + "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 77a6c61b7d..5fa48fa3fd 100644 --- a/flake.nix +++ b/flake.nix @@ -59,6 +59,7 @@ bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { }; ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; + fccf = pkgs.callPackage ./.flake/pkgs/fccf { }; rapidcheckFull = pkgs.symlinkJoin { name = "rapidcheckFull"; paths = (with pkgs; [ rapidcheck.out rapidcheck.dev ]); @@ -162,6 +163,7 @@ ruff jq gh + expect ]) (with pkgs.python3Packages; [ gitpython @@ -179,6 +181,7 @@ (with self.packages.${system}; [ ffdb hpp2plantuml + fccf ]) ]; }; diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- 
a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..f9bef91b25 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,88 @@ #ifndef _FLEXFLOW_KERNELS_ACCESSOR_H #define _FLEXFLOW_KERNELS_ACCESSOR_H -#include "array_shape.h" -#include "device.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" -#include "utils/exception.h" +#include "pcg/device_type.dtg.h" +#include "utils/containers/transform.h" #include "utils/required.h" +#include namespace FlexFlow { +nonnegative_int + calculate_accessor_offset(LegionOrdered const &, + ArrayShape const &); + +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
::type const *get() const { + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast const *>(this->ptr); + } + + int32_t const *get_int32_ptr() const; + int64_t const *get_int64_ptr() const; + float const *get_float_ptr() const; + double const *get_double_ptr() const; + half const *get_half_ptr() const; + + GenericTensorAccessorR() = delete; + + GenericTensorAccessorR(DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorR const &) const; + bool operator!=(GenericTensorAccessorR const &) const; + + template + real_type_t
const &at(FFOrdered const &indices) const { + return this->at
(legion_ordered_from_ff_ordered(indices)); + } + + template + real_type_t
const & + at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorR::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + +public: + DataType data_type; + ArrayShape shape; + void const *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; +}; + +std::string format_as(GenericTensorAccessorR const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); + class GenericTensorAccessorW { public: template typename data_type_enum_to_class
::type *get() const { - if (this->data_type == DT) { - return static_cast *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + return static_cast *>(this->ptr); } int32_t *get_int32_ptr() const; @@ -28,76 +91,76 @@ class GenericTensorAccessorW { double *get_double_ptr() const; half *get_half_ptr() const; -public: - DataType data_type; - ArrayShape shape; - req ptr; -}; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW, - data_type, - shape, - ptr); + GenericTensorAccessorW() = delete; -std::string format_as(GenericTensorAccessorW const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); + GenericTensorAccessorW(DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type); + + bool operator==(GenericTensorAccessorW const &) const; + bool operator!=(GenericTensorAccessorW const &) const; + + operator GenericTensorAccessorR() const; -class GenericTensorAccessorR { -public: template - typename data_type_enum_to_class
::type const *get() const { - if (this->data_type == DT) { - return static_cast const *>(this->ptr); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access data type ({} != {})", this->data_type, DT)); - } + real_type_t
&at(FFOrdered const &indices) { + return this->at
(legion_ordered_from_ff_ordered(indices)); } - int32_t const *get_int32_ptr() const; - int64_t const *get_int64_ptr() const; - float const *get_float_ptr() const; - double const *get_double_ptr() const; - half const *get_half_ptr() const; + template + real_type_t
&at(LegionOrdered const &indices) { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset.unwrap_nonnegative()]; + } + + template + real_type_t
const &at(FFOrdered const &indices) const { + return this->at
(legion_ordered_from_ff_ordered(indices)); + } + + template + real_type_t
&at(LegionOrdered const &indices) const { + ASSERT(this->device_type == DeviceType::CPU, + "GenericTensorAccessorW::at() requires CPU-allocated tensor"); + ASSERT(this->data_type == DT, "Invalid datatype requested"); + + using T = real_type_t
; + T const *data_ptr = static_cast(this->ptr); + nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + return data_ptr[offset]; + } public: DataType data_type; ArrayShape shape; - req ptr; + void *ptr; + DeviceType device_type; + +private: + std::tuple + tie() const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR, - data_type, - shape, - ptr); - -std::string format_as(GenericTensorAccessorR const &); -std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); -int32_t *get_int32_ptr(GenericTensorAccessorW const &); -int64_t *get_int64_ptr(GenericTensorAccessorW const &); -float *get_float_ptr(GenericTensorAccessorW const &); -double *get_double_ptr(GenericTensorAccessorW const &); -half *get_half_ptr(GenericTensorAccessorW const &); -std::vector - get_int32_ptrs(std::vector const &); -std::vector - get_int64_ptrs(std::vector const &); -std::vector - get_float_ptrs(std::vector const &); -std::vector - get_double_ptrs(std::vector const &); -std::vector get_half_ptrs(std::vector const &); +std::string format_as(GenericTensorAccessorW const &); +std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); static_assert(is_fmtable const &>::value, ""); template typename data_type_enum_to_class
::type * get(GenericTensorAccessorW const &a) { - if (a.data_type == DT) { - return static_cast *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast *>(a.ptr); } template @@ -113,12 +176,8 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - if (a.data_type == DT) { - return static_cast const *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast const *>(a.ptr); } int32_t const *get_int32_ptr(GenericTensorAccessorR const &); @@ -137,6 +196,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +224,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +233,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW 
&dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..39bad6599c 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,14 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void deallocate_tensor(GenericTensorAccessorW const &); + void deallocate_tensor(GenericTensorAccessorR const &); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/kernels/include/kernels/array_coord.struct.toml new file mode 100644 index 0000000000..8ce121f2bf --- /dev/null +++ b/lib/kernels/include/kernels/array_coord.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "ArrayCoord" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", + "json", +] + +includes = [ + "op-attrs/ff_ordered/ff_ordered.h", + "utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "ff_ordered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..25ef8116f2 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define 
_FLEXFLOW_KERNELS_ARRAY_SHAPE_H +#include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -15,9 +16,7 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + explicit ArrayShape(LegionOrdered const &dims); /** * @brief Alias of ArrayShape::num_elements for compatibility with @@ -46,24 +45,40 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape - sub_shape(std::optional> start, - std::optional> end) const; + ArrayShape sub_shape(ff_dim_t const &start, + std::optional const &end) const; + + ArrayShape sub_shape(legion_dim_t const &start, + std::optional const &end) const; public: LegionOrdered dims; private: std::tuple tie() const; + + friend ::std::hash; }; +std::string format_as(ArrayShape const &); +std::ostream &operator<<(std::ostream &, ArrayShape const &); + nonnegative_int get_volume(ArrayShape const &); +ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); -std::string format_as(ArrayShape const &); -std::ostream &operator<<(std::ostream &, ArrayShape const &); +std::unordered_set get_array_coord_set(ArrayShape const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; + +} // namespace std + #endif diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..b3c77d3430 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/device.h" #include "kernels/ff_handle.h" @@ -64,8 +63,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +103,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..8b67f564d2 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index f2ca17f429..9bb2753a12 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -1,15 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H 
#define _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels { -namespace BatchNorm { +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -29,9 +27,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -46,8 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels -} // namespace FlexFlow - +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..5ec4cb3975 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -1,29 +1,19 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H -#include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..343ba253d9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..c87465a01f 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..75fdd56498 --- /dev/null +++ 
b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..1e3c55bf59 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..3b7c0672df 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "utils/visitable.h" @@ -34,8 +34,7 @@ 
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..81fd59dafb --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_tensor_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &, + Allocator &cpu_allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &, + Allocator &cpu_allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ 
b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..2cc6dd60a3 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git 
a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..fd596f2ccf 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "device.h" #include "ff_handle.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..0257b3b4a6 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/ops/element_unary.h" #include @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, 
ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} // namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f51a730314 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff 
--git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 179ce41cbf..31b3296a98 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -5,7 +5,7 @@ #include #endif -#include "device.h" +#include "kernels/device.h" #include "utils/visitable.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..b2b1164f92 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -1,23 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); + void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/format_accessor_contents.h b/lib/kernels/include/kernels/format_accessor_contents.h new file mode 100644 index 0000000000..b50cffbbef --- /dev/null +++ b/lib/kernels/include/kernels/format_accessor_contents.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +std::string format_accessor_r_contents(GenericTensorAccessorR const &); +std::string format_accessor_w_contents(GenericTensorAccessorW const &); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..8cbc7e457e 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,23 +15,21 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..10cf2fb14b 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" namespace FlexFlow { @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 7b9b9c455c..947bbd00bb 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -2,7 +2,13 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LEGION_DIM_H #include "kernels/legion_dim_t.dtg.h" -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "kernels/legion_ordered/legion_ordered.h" +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -11,7 +17,10 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions); template -using LegionOrdered = DimOrdered; +std::set key_range(LegionOrdered const &d) { + return transform(set_of(nonnegative_range(num_elements(d))), + [](nonnegative_int i) { return legion_dim_t{i}; }); +} template FFOrdered @@ -25,17 +34,6 @@ LegionOrdered return LegionOrdered(ff_ordered.rbegin(), ff_ordered.rend()); } -template -std::string format_as(LegionOrdered const &v) { - std::vector as_vec(v.cbegin(), v.cend()); - return fmt::format("", as_vec); -} - -template -std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { - return (s << fmt::to_string(v)); -} - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h 
b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h new file mode 100644 index 0000000000..ad8b3bad6d --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h @@ -0,0 +1,197 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H + +#include "kernels/legion_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct LegionOrdered { + LegionOrdered() {} + + LegionOrdered(std::initializer_list const &l) + : contents(l.begin(), l.end()) {} + + LegionOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + LegionOrdered(It begin, It end) : contents(begin, end) {} + + template + LegionOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(legion_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(legion_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &operator[](legion_dim_t idx) const { + return this->at(idx); + } + + T &operator[](legion_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(legion_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool operator==(LegionOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(LegionOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using 
const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(LegionOrdered const &lhs, LegionOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(LegionOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::LegionOrdered> { + static ::FlexFlow::LegionOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, + ::FlexFlow::LegionOrdered const &x) { + j = std::vector{x.cbegin(), 
x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::LegionOrdered> { + size_t operator()(::FlexFlow::LegionOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::LegionOrdered> { + static Gen<::FlexFlow::LegionOrdered> arbitrary() { + return gen::construct<::FlexFlow::LegionOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/slice.h b/lib/kernels/include/kernels/legion_ordered/slice.h new file mode 100644 index 0000000000..6980c0d9ec --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/slice.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +LegionOrdered slice(LegionOrdered const &d, + legion_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](legion_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + + return LegionOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/transform.h b/lib/kernels/include/kernels/legion_ordered/transform.h new file mode 100644 index 0000000000..55cc1ff1ea --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H + 
+#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +LegionOrdered transform(LegionOrdered const &d, F &&f) { + return LegionOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..21d84c2567 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace 
Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h similarity index 74% rename from lib/local-execution/include/local-execution/local_cpu_allocator.h rename to lib/kernels/include/kernels/local_cpu_allocator.h index d1e81facf2..9653dcf00e 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -1,3 +1,6 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H + #include "kernels/allocation.h" #include @@ -12,6 +15,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; @@ -20,3 +25,5 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); Allocator create_local_cpu_memory_allocator(); } // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..576edb0ffa 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H #define _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H -#include "device.h" +#include "kernels/device.h" namespace 
FlexFlow { @@ -19,6 +19,9 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,9 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + 
PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..d552831c78 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -16,15 +17,18 @@ void sgd_ps_update_task_gpu(ffStream_t, float *weight_ptr, float *sgd_v_ptr); +#ifdef FF_USE_NCCL void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, float *sgd_v_ptr); +#endif void adam_ps_update_task_gpu(ffStream_t, float alpha_t, @@ -33,9 +37,11 @@ void adam_ps_update_task_gpu(ffStream_t, float weight_decay, float epsilon, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + int num_replicas, + float *weight_ptr, float *adam_v_ptr, 
- float *weight_ptr); + float *adam_m_ptr); void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, @@ -45,9 +51,10 @@ void adam_nccl_update_task_gpu(ffStream_t, float epsilon, PerDeviceFFHandle const &, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..aa3a7a1ef7 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git 
a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..76aa07d0a4 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,13 +69,12 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 655d540685..7c4145c426 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_PROFILING_H #define _FLEXFLOW_KERNELS_PROFILING_H -#include "device.h" +#include "kernels/device.h" #include "kernels/profiling_settings.dtg.h" #include "utils/visitable.h" diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..10e8e4393b 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "array_shape.h" -#include "device.h" -#include "ff_handle.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" #include "op-attrs/operator_type.dtg.h" namespace FlexFlow { @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..08f73cd9ab 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h 
index 409fc81f44..0b113868ee 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -1,24 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..2a2eaa5eb6 --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..88c11d2fb0 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,8 @@ #ifndef 
_FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "utils/required_core.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..768707175c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,30 +1,21 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/reverse_kernels_cpu.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); void backward_kernel(ffStream_t stream, - float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); + GenericTensorAccessorR const 
&output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..ec82000f8f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h new file mode 100644 index 0000000000..766d70b915 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H + +#include "kernels/array_shape.h" +#include "kernels/reverse_kernels_params.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml 
new file mode 100644 index 0000000000..a5dbd750bc --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "ReverseKernelsParams" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "num_out_blks" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "reverse_dim_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "in_blk_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "out_size" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..60101578e3 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" namespace FlexFlow { @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..3b580f94be 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,12 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "device.h" +#include "kernels/device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..085594d57f 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" namespace FlexFlow { @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 0f1cc2ae61..776370dcbd 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -1,15 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { 
void forward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, @@ -18,11 +17,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 27b7eb390d..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,192 +0,0 @@ -#include "kernels/accessor.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - a.data_type, - a.shape, - 
a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - 
GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index d666592e77..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, tensor_shape, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc new file mode 100644 index 0000000000..cdd57b8947 --- /dev/null +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void 
cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume().unwrap_nonnegative(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc new file mode 100644 index 0000000000..577984f21a --- /dev/null +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
(), + input.get
(), + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + for (int i = 0; i < num_elements; ++i) { + input_grad.get
()[i] += output_grad.get
()[i]; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc similarity index 100% rename from lib/kernels/src/cpu/initializer_kernels.cc rename to lib/kernels/src/cpu/ops/initializer_kernels.cc diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..798a4ea8c7 --- /dev/null +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow::Kernels::Replicate { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + memcpy(output.get
(), + input.get
(), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + nonnegative_int num_elements, + nonnegative_int num_replicas) { + using T = real_type_t
; + + for (nonnegative_int i : nonnegative_range(num_elements)) { + T cur_sum = 0; + for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { + cur_sum += output.at
(LegionOrdered{replica_idx, i}); + } + input.at
(LegionOrdered{i}) = cur_sum; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + DataTypeDispatch1{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas) { + nonnegative_int num_elements = input.shape.num_elements(); + DataTypeDispatch1{}(input.data_type, + output, + input, + num_elements, + nonnegative_int{num_replicas}); +} + +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..4d9eb8cc09 --- /dev/null +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,46 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow::Kernels::Reverse { + +template +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + ReverseAttrs const &attrs) { + nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + + for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + nonnegative_int input_reverse_axis_coord = + input_coord.ff_ordered.at(attrs.axis); + + ArrayCoord output_coord = input_coord; + output_coord.ff_ordered.at(attrs.axis) = + nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + input_reverse_axis_coord.unwrap_nonnegative() - 1}; + + output.at
(output_coord.ff_ordered) = + input.at
(input_coord.ff_ordered); + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { + + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor, attrs); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { + DataTypeDispatch1{}(output_grad_accessor.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 66388c0ec8..86b2d8a437 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "utils/containers/reversed.h" @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template 
__global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..cb84f0e777 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -13,16 +13,15 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - 
TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ 
-192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional 
aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,229 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + 
weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t 
stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -388,39 +557,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + 
weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const 
&weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -448,27 +807,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 6c22efda21..2fccf4b48f 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/loss_function_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..54ecd076f6 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "internal/device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void 
update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics 
perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/attention_kernels.cu b/lib/kernels/src/cuda/ops/attention_kernels.cu index 38c32ad9e4..e5bdb6f21d 100644 --- a/lib/kernels/src/cuda/ops/attention_kernels.cu +++ b/lib/kernels/src/cuda/ops/attention_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/attention_kernels.h" #include "kernels/device.h" diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index eb23514c5f..348eed9f0c 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/batch_matmul_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 4e153a028e..ceb3a1b3d9 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_kernels.h" #include "kernels/ff_handle.h" @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index fe7aec68b9..f3ea6db660 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/cast_kernels.h" #include "kernels/datatype_dispatch.h" @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + 
GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 7cc67ceed8..08cc343fd2 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/combine_kernels.h" #include "kernels/datatype_dispatch.h" diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 2715ff16e9..37dbbe12f8 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -13,50 +13,58 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); + assert(legion_axis.value < shape.num_dims()); + if (legion_axis.value == 0_n) { + legion_axis.value = 1_n; + } + blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() .unwrap_nonnegative(); - num_blocks = - shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); + num_blocks = shape.sub_shape(legion_axis, std::nullopt) + .num_elements() + .unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = 
inputs.size(); - assert(num_inputs <= MAX_NUM_INPUTS); + assert(inputs.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); - for (int i = 0; i < num_inputs; i++) { - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - copy_with_stride<<>>(output.get_float_ptr() + offset, - inputs[i].get_float_ptr(), - num_blocks, + input.get_float_ptr(), + blocks_to_copy, output_blk_size, - input_blk_sizes[i]); - offset += input_blk_sizes[i]; + input_blk_size); + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } @@ -64,32 +72,32 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? 
input_blk_size * input_num_blocks + : input_blk_size; } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index dac55539d2..16db62a57f 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index adf0cd8e89..c5fa56bc78 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/dropout_kernels.h" #include "kernels/ff_handle.h" diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 44273a323f..3a4a77b3dd 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/element_binary_kernels.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 056c80ecf6..218e74b939 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/get_op_type.h" @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 973d05f596..594a183ff0 100644 --- 
a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/flat_kernels.h" @@ -35,8 +35,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..19e495a540 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -13,14 +13,12 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,15 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) + output.shape + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -157,9 +159,13 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape - .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 
1)) + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +186,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..02bda55828 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "utils/integer_conversions.h" @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + 
static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float 
*)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 2831562f58..b8dfac5204 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/partition_kernels.h" @@ -40,8 +40,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel> <<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..e8ea3f64c2 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/pool_2d_kernels.h" namespace FlexFlow { @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 02a89da807..563bbae21d 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reduce_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 5d95a3766a..d9c09b082d 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reduction_kernels.h" @@ -55,8 +55,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -75,9 +75,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4706f38fd4..4685fd7a2d 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels.h" @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -51,15 +50,15 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } @@ -72,11 +71,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index c5a289ce6b..a6a390b38e 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reshape_kernels.h" @@ -43,8 +43,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..582aa02386 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -13,13 +13,11 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_params.h" -namespace FlexFlow { - -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, @@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); i = i - blk_idx * (reverse_dim_size * in_blk_size); coord_t reverse_dim_idx = i / in_blk_size; i = i - reverse_dim_idx * in_blk_size; coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + out_ptr[out_idx] = in_ptr[in_idx]; } } -void forward_kernel(cudaStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t 
reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { +static void forward_kernel_internal(cudaStream_t stream, + float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { reverse_forward_kernel<< 0.0f) { - V[i] = V[i] * momentum + gt; - if (nesterov) { - gt = gt + momentum * V[i]; - } else { - gt = V[i]; - } - } - W[i] -= lr * gt; - } -} - -__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - apply_add_with_scale - <<>>( - (float *)w_grad_ptr, src, size, 1.0f); - } - // checkCUDA(cudaDeviceSynchronize()); - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { - // Use NCCL to sync gradients - // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); - // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); - - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -// 
================================================================== -// Adam Optimizer -// ================================================================== -__global__ void - add_kernel(int count, float scale, float const *src, float *dst) { - CUDA_KERNEL_LOOP(i, count) { - dst[i] += src[i] * scale; - } -} - -__global__ void scale_kernel(int count, float a, float b, float *ptr) { - CUDA_KERNEL_LOOP(i, count) { - ptr[i] = (b - a) * ptr[i] + a; - } -} - -__global__ void adam_update(int count, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *WGrad, - float *M, - float *V, - float *W) { - // Reference for weight decay - // https://www.fast.ai/2018/07/02/adam-weight-decay/ - CUDA_KERNEL_LOOP(i, count) { - // W[i] -= weight_decay * alpha_t * W[i]; - // float gt = WGrad[i]; - float gt = WGrad[i] + weight_decay * W[i]; - float mt = beta1 * M[i] + (1 - beta1) * gt; - float vt = beta2 * V[i] + (1 - beta2) * gt * gt; - M[i] = mt; - V[i] = vt; - W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); - } -} - -__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); - } - // checkCUDA(cudaDeviceSynchronize()); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, - PerDeviceOpState 
const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - // Use NCCL to sync gradients - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, - stream)); - // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", - // op->alpha, op->alpha_t, op->weight_decay); - // Step 2: Adam update - adam_update<<>>( - size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} -#endif - -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu new file mode 100644 index 0000000000..fe817876ce --- /dev/null +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -0,0 +1,205 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "internal/device.h" +#include "kernels/nccl.h" +#include "kernels/optimizer_kernels.h" +#include "utils/exception.h" + +namespace FlexFlow { + +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + float const *WGrad, + float *V, + float *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + float gt = WGrad[i] + weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * momentum + gt; + if (nesterov) { + gt = gt + momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= lr * gt; + } +} + +__host__ void sgd_ps_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *sgd_v_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = weight_grad_ptr + i * size; + apply_add_with_scale + <<>>( + (float *)weight_grad_ptr, src, size, 1.0f); + } + + // Step 2: SGD update + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void sgd_nccl_update_task_gpu(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr) { + // Step 1: Use NCCL to sync gradients + ncclComm_t comm = handle.ncclComm; + checkNCCL(ncclAllReduce( + w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream)); + + // Step 2: SGD update + sgd_update<<>>( + size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr); +} +#endif + +// ================================================================== +// Adam Optimizer +// ================================================================== +__global__ void + 
add_kernel(int count, float scale, float const *src, float *dst) { + CUDA_KERNEL_LOOP(i, count) { + dst[i] += src[i] * scale; + } +} + +__global__ void scale_kernel(int count, float a, float b, float *ptr) { + CUDA_KERNEL_LOOP(i, count) { + ptr[i] = (b - a) * ptr[i] + a; + } +} + +__global__ void adam_update(int count, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *WGrad, + float *M, + float *V, + float *W) { + // Reference for weight decay + // https://www.fast.ai/2018/07/02/adam-weight-decay/ + CUDA_KERNEL_LOOP(i, count) { + // W[i] -= weight_decay * alpha_t * W[i]; + // float gt = WGrad[i]; + float gt = WGrad[i] + weight_decay * W[i]; + float mt = beta1 * M[i] + (1 - beta1) * gt; + float vt = beta2 * V[i] + (1 - beta2) * gt * gt; + M[i] = mt; + V[i] = vt; + W[i] -= alpha_t * mt / (sqrt(vt) + epsilon); + } +} + +__host__ void adam_ps_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *w_grad_ptr, + size_t size, + int num_replicas, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Gather gradients in the first replica + for (int i = 1; i < num_replicas; i++) { + float const *src = w_grad_ptr + i * size; + add_kernel<<>>( + (float *)w_grad_ptr, src, size); + } + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} + +#ifdef FF_USE_NCCL +__host__ void nccl_update_task_gpu(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr) { + // Step 1: Use NCCL to sync gradients + checkNCCL(ncclAllReduce(w_grad_ptr, + (float *)w_grad_ptr, + size, + ncclFloat, + ncclSum, + handle.ncclComm, + stream)); + + // Step 2: Adam update + adam_update<<>>(size, + alpha_t, + 
beta1, + beta2, + weight_decay, + epsilon, + w_grad_ptr, + m_ptr, + v_ptr, + w_ptr); +} +#endif + +} // namespace FlexFlow diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp index 7ca3149f2f..aefe53cc46 100644 --- a/lib/kernels/src/hip/embedding_kernels.cpp +++ b/lib/kernels/src/hip/embedding_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/embedding_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include @@ -364,8 +364,8 @@ struct ForwardKernel { weight.data_type == DataType::FLOAT || weight.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -374,10 +374,11 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - batch_size); + in_dim, + batch_size, + aggr); } else { - assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -386,9 +387,7 @@ struct ForwardKernel { output.get(), weight.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } @@ -408,8 +407,9 @@ struct BackwardKernel { assert(output.data_type == DataType::HALF || output.data_type == DataType::FLOAT || output.data_type == DataType::DOUBLE); - if (aggr == AggregateOp::NONE) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), + + if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -418,9 +418,11 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - batch_size); + in_dim, 
+ batch_size, + aggr); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr), + hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr), GET_BLOCKS(output.shape.get_volume()), CUDA_NUM_THREADS, 0, @@ -429,9 +431,7 @@ struct BackwardKernel { output.get(), weight_grad.get(), out_dim, - in_dim, - batch_size, - aggr); + batch_size); } } } diff --git a/lib/kernels/src/hip/loss_function_kernels.cpp b/lib/kernels/src/hip/loss_function_kernels.cpp index e82b5c96d5..05068f1bd0 100644 --- a/lib/kernels/src/hip/loss_function_kernels.cpp +++ b/lib/kernels/src/hip/loss_function_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/loss_function_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/attention_kernels.cpp b/lib/kernels/src/hip/ops/attention_kernels.cpp index 005cef30d1..b374ead305 100644 --- a/lib/kernels/src/hip/ops/attention_kernels.cpp +++ b/lib/kernels/src/hip/ops/attention_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/attention_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp index c4b3be823f..6d9ae8a268 100644 --- a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_matmul_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp index 8e94b462cd..764a3e0b58 100644 --- a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_norm_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/cast_kernels.cpp b/lib/kernels/src/hip/ops/cast_kernels.cpp index fa0c37ffa1..1035657c04 100644 --- a/lib/kernels/src/hip/ops/cast_kernels.cpp +++ b/lib/kernels/src/hip/ops/cast_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/cast_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/combine_kernels.cpp b/lib/kernels/src/hip/ops/combine_kernels.cpp index aa01f02276..f1e0422747 100644 --- a/lib/kernels/src/hip/ops/combine_kernels.cpp +++ b/lib/kernels/src/hip/ops/combine_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/combine_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp index aa38be739b..a215d67942 100644 --- a/lib/kernels/src/hip/ops/concat_kernels.cpp +++ b/lib/kernels/src/hip/ops/concat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/concat_kernels.h" -#include "device.h" +#include "internal/device.h" #include #include diff --git a/lib/kernels/src/hip/ops/conv_2d_kernels.h b/lib/kernels/src/hip/ops/conv_2d_kernels.h index bcf015d561..76a73ab08c 100644 --- a/lib/kernels/src/hip/ops/conv_2d_kernels.h +++ b/lib/kernels/src/hip/ops/conv_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H #define _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/hip/ops/dropout_kernels.cpp b/lib/kernels/src/hip/ops/dropout_kernels.cpp index baaf8e6902..d85c0ae054 100644 --- a/lib/kernels/src/hip/ops/dropout_kernels.cpp +++ b/lib/kernels/src/hip/ops/dropout_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/dropout_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include diff --git 
a/lib/kernels/src/hip/ops/element_binary_kernels.cpp b/lib/kernels/src/hip/ops/element_binary_kernels.cpp index bc66bbff2f..9e0452b09b 100644 --- a/lib/kernels/src/hip/ops/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_binary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_binary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.dtg.h" diff --git a/lib/kernels/src/hip/ops/element_unary_kernels.cpp b/lib/kernels/src/hip/ops/element_unary_kernels.cpp index f4b0ccb82d..163f13a6da 100644 --- a/lib/kernels/src/hip/ops/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_unary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_unary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "op-attrs/get_op_type.h" #include diff --git a/lib/kernels/src/hip/ops/flat_kernels.cpp b/lib/kernels/src/hip/ops/flat_kernels.cpp index 763fb9e322..dedfb4b9a9 100644 --- a/lib/kernels/src/hip/ops/flat_kernels.cpp +++ b/lib/kernels/src/hip/ops/flat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/flat_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include diff --git a/lib/kernels/src/hip/ops/gather_kernels.cpp b/lib/kernels/src/hip/ops/gather_kernels.cpp index 17c0014e98..6e9e4c6a2c 100644 --- a/lib/kernels/src/hip/ops/gather_kernels.cpp +++ b/lib/kernels/src/hip/ops/gather_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/gather_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/partition_kernels.cpp b/lib/kernels/src/hip/ops/partition_kernels.cpp index 4591247faa..26748a7e45 100644 --- a/lib/kernels/src/hip/ops/partition_kernels.cpp +++ b/lib/kernels/src/hip/ops/partition_kernels.cpp @@ -14,7 +14,7 @@ */ #include 
"kernels/partition_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp index ed942c105c..7e5ae2ab80 100644 --- a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/pool_2d_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/reduce_kernels.cpp b/lib/kernels/src/hip/ops/reduce_kernels.cpp index 468543dd5b..c0bcc84d48 100644 --- a/lib/kernels/src/hip/ops/reduce_kernels.cpp +++ b/lib/kernels/src/hip/ops/reduce_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reduce_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/replicate_kernels.cpp b/lib/kernels/src/hip/ops/replicate_kernels.cpp index 8d27bb1908..ee7bf701c0 100644 --- a/lib/kernels/src/hip/ops/replicate_kernels.cpp +++ b/lib/kernels/src/hip/ops/replicate_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/replicate_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reshape_kernels.cpp b/lib/kernels/src/hip/ops/reshape_kernels.cpp index 47978a5f4a..810b929e24 100644 --- a/lib/kernels/src/hip/ops/reshape_kernels.cpp +++ b/lib/kernels/src/hip/ops/reshape_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reshape_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reverse_kernels.cpp b/lib/kernels/src/hip/ops/reverse_kernels.cpp index 03e97245bf..a56ff3540a 100644 --- a/lib/kernels/src/hip/ops/reverse_kernels.cpp +++ b/lib/kernels/src/hip/ops/reverse_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reverse_kernels.h" -#include "device.h" 
+#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/softmax_kernels.cpp b/lib/kernels/src/hip/ops/softmax_kernels.cpp index 3a8f2813b7..610675850b 100644 --- a/lib/kernels/src/hip/ops/softmax_kernels.cpp +++ b/lib/kernels/src/hip/ops/softmax_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/softmax_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/split_kernels.cpp b/lib/kernels/src/hip/ops/split_kernels.cpp index 5599ae6d6f..3034b633a6 100644 --- a/lib/kernels/src/hip/ops/split_kernels.cpp +++ b/lib/kernels/src/hip/ops/split_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/split_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/topk_kernels.cpp b/lib/kernels/src/hip/ops/topk_kernels.cpp index f085c5831f..777d9edffa 100644 --- a/lib/kernels/src/hip/ops/topk_kernels.cpp +++ b/lib/kernels/src/hip/ops/topk_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/topk_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/transpose_kernels.cpp b/lib/kernels/src/hip/ops/transpose_kernels.cpp index ef9dd58c63..c5122f34bf 100644 --- a/lib/kernels/src/hip/ops/transpose_kernels.cpp +++ b/lib/kernels/src/hip/ops/transpose_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/transpose_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "utils/exception.h" #include diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/internal/device.cc similarity index 97% rename from lib/kernels/src/device.cc rename to lib/kernels/src/internal/device.cc index f46099c79a..eb3d229c2a 100644 --- a/lib/kernels/src/device.cc +++ b/lib/kernels/src/internal/device.cc @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" namespace FlexFlow { diff --git 
a/lib/kernels/src/device.h b/lib/kernels/src/internal/device.h similarity index 98% rename from lib/kernels/src/device.h rename to lib/kernels/src/internal/device.h index ceff2f92ff..226c7ad174 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_KERNELS_SRC_DEVICE_H -#define _FLEXFLOW_KERNELS_SRC_DEVICE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #include "kernels/array_shape.h" #include "kernels/device.h" diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc new file mode 100644 index 0000000000..b5042f77a0 --- /dev/null +++ b/lib/kernels/src/kernels/accessor.cc @@ -0,0 +1,249 @@ +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include + +namespace FlexFlow { + +nonnegative_int + calculate_accessor_offset(LegionOrdered const &indices, + ArrayShape const &shape) { + ASSERT(indices.size() == shape.num_dims(), + "Number of indices does not match the number of dimensions"); + + nonnegative_int offset = 0_n; + nonnegative_int multiplier = 1_n; + + for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { + ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), + "Out of bounds access", + dim); + + offset += indices.at(dim) * multiplier; + multiplier *= shape.at(legion_dim_t{dim}); + } + + return offset; +} + +void copy_accessor_data_to_l_from_r( + GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor) { + size_t num_bytes = + dst_accessor.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + + DeviceType dst_device_type = dst_accessor.device_type; + DeviceType src_device_type = src_accessor.device_type; + + if 
(src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::CPU) { + memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes); + } else if (src_device_type == DeviceType::CPU && + dst_device_type == DeviceType::GPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice)); + } else if (src_device_type == DeviceType::GPU && + dst_device_type == DeviceType::CPU) { + checkCUDA(cudaMemcpy( + dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost)); + } else { + assert(src_device_type == DeviceType::GPU); + assert(dst_device_type == DeviceType::GPU); + checkCUDA(cudaMemcpy(dst_accessor.ptr, + src_accessor.ptr, + num_bytes, + cudaMemcpyDeviceToDevice)); + } +} + +GenericTensorAccessorW::operator GenericTensorAccessorR() const { + return read_only_accessor_from_write_accessor(*this); +} + +GenericTensorAccessorW::GenericTensorAccessorW( + DataType data_type, + ArrayShape const &shape, + void *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorW::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorW::operator==( + GenericTensorAccessorW const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorW::operator!=( + GenericTensorAccessorW const &other) const { + return this->tie() != other.tie(); +} + +int32_t *GenericTensorAccessorW::get_int32_ptr() const { + return this->get(); +} + +int64_t *GenericTensorAccessorW::get_int64_ptr() const { + return this->get(); +} + +float *GenericTensorAccessorW::get_float_ptr() const { + return this->get(); +} + +double *GenericTensorAccessorW::get_double_ptr() const { + return this->get(); +} + +half *GenericTensorAccessorW::get_half_ptr() const { + return this->get(); +} + +std::string format_as(GenericTensorAccessorW const &a) { + return fmt::format("", + 
a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { + return (s << fmt::to_string(a)); +} + +GenericTensorAccessorR::GenericTensorAccessorR( + DataType data_type, + ArrayShape const &shape, + void const *ptr, + DeviceType device_type = DeviceType::GPU) + : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + +std::tuple + GenericTensorAccessorR::tie() const { + return std::tie(this->data_type, this->shape, this->ptr, this->device_type); +} + +bool GenericTensorAccessorR::operator==( + GenericTensorAccessorR const &other) const { + return this->tie() == other.tie(); +} + +bool GenericTensorAccessorR::operator!=( + GenericTensorAccessorR const &other) const { + return this->tie() != other.tie(); +} + +int32_t const *GenericTensorAccessorR::get_int32_ptr() const { + return this->get(); +} + +int64_t const *GenericTensorAccessorR::get_int64_ptr() const { + return this->get(); +} + +float const *GenericTensorAccessorR::get_float_ptr() const { + return this->get(); +} + +double const *GenericTensorAccessorR::get_double_ptr() const { + return this->get(); +} + +half const *GenericTensorAccessorR::get_half_ptr() const { + return get(); +} + +std::string format_as(GenericTensorAccessorR const &a) { + return fmt::format("", + a.data_type, + a.shape, + a.ptr); +} + +std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { + return (s << fmt::to_string(a)); +} + +int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +float const *get_float_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +double const *get_double_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +half const *get_half_ptr(GenericTensorAccessorR const &a) { + return get(a); +} + +std::vector + get_int32_ptrs(std::vector const &a) { + return get(a); +} + +std::vector 
+ get_int64_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_float_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_double_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_half_ptrs(std::vector const &a) { + return get(a); +} + +GenericTensorAccessorR read_only_accessor_from_write_accessor( + GenericTensorAccessorW const &writable) { + return GenericTensorAccessorR{writable.data_type, + writable.shape, + req(writable.ptr), + writable.device_type}; +} + +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { + return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; +} + +bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, + ArrayShape const &expected_shape, + DataType const &expected_dtype) { + return accessor.shape == expected_shape && + accessor.data_type == expected_dtype; +} + +std::pair + get_shape_and_datatype(GenericTensorAccessorR const &accessor) { + return std::make_pair(accessor.shape, accessor.data_type); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc new file mode 100644 index 0000000000..b9f253bcff --- /dev/null +++ b/lib/kernels/src/kernels/allocation.cc @@ -0,0 +1,38 @@ +#include "kernels/allocation.h" +#include "op-attrs/tensor_shape.h" + +namespace FlexFlow { + +void *Allocator::allocate(size_t mem_size) { + return this->i_allocator->allocate(mem_size); +} + +void Allocator::deallocate(void *ptr) { + this->i_allocator->deallocate(ptr); +} + +DeviceType Allocator::get_allocation_device_type() const { + return this->i_allocator->get_allocation_device_type(); +} + +GenericTensorAccessorW + Allocator::allocate_tensor(TensorShape const &tensor_shape) { + void *ptr = + this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); + return GenericTensorAccessorW{ + tensor_shape.data_type, + 
array_shape_from_tensor_shape(tensor_shape), + ptr, + this->get_allocation_device_type(), + }; +} + +void Allocator::deallocate_tensor(GenericTensorAccessorW const &t) { + this->deallocate(t.ptr); +} + +void Allocator::deallocate_tensor(GenericTensorAccessorR const &t) { + this->deallocate(const_cast(t.ptr)); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc similarity index 51% rename from lib/kernels/src/array_shape.cc rename to lib/kernels/src/kernels/array_shape.cc index 243185ada4..34a53c1bb3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -1,23 +1,20 @@ #include "kernels/array_shape.h" +#include "kernels/legion_ordered/slice.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/slice.h" +#include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/containers/vector_of.h" +#include "utils/hash/tuple.h" +#include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionOrdered - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - return LegionOrdered{reversed(vector_of(ff_ordered))}; -} - -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) - : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} - -ArrayShape::ArrayShape(TensorShape const &shape) - : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {} - -ArrayShape::ArrayShape(std::vector const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} nonnegative_int ArrayShape::get_volume() const { @@ -59,10 +56,19 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { return this->tie() != other.tie(); } -ArrayShape ArrayShape::sub_shape( - std::optional> start, - std::optional> end) 
const { - NOT_IMPLEMENTED(); +ArrayShape + ArrayShape::sub_shape(ff_dim_t const &start, + std::optional const &maybe_end) const { + FFOrdered ff_ordered_dims = + ff_ordered_from_legion_ordered(this->dims); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; +} + +ArrayShape + ArrayShape::sub_shape(legion_dim_t const &start, + std::optional const &maybe_end) const { + return ArrayShape{slice(this->dims, start, maybe_end)}; } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -81,15 +87,6 @@ std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " get_array_coord_set(ArrayShape const &shape) { + std::vector> per_dim_ranges = + transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](nonnegative_int dim_size) -> std::vector { + return nonnegative_range(dim_size); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) { + return ArrayCoord{ff_ordered_of(raw_point)}; + }); +} + } // namespace FlexFlow + +namespace std { + +using namespace FlexFlow; + +size_t hash::operator()(ArrayShape const &s) const { + return get_std_hash(s.tie()); +} + +} // namespace std diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc new file mode 100644 index 0000000000..d8619d8ce6 --- /dev/null +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -0,0 +1,66 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template 
+struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( + GenericTensorAccessorR const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_r(accessor, cpu_allocator); + } else { + return accessor; + } +} + +GenericTensorAccessorW copy_tensor_accessor_w_to_cpu_if_necessary( + GenericTensorAccessorW const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_w(accessor, cpu_allocator); + } else { + return accessor; + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc new file mode 100644 index 
0000000000..1b8ab35d89 --- /dev/null +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -0,0 +1,184 @@ +#include "kernels/format_accessor_contents.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +template +struct Print1DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 1_n); + + nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + + stream << "[" + << join_strings(nonnegative_range(ncols), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{col_idx})); + }) + << "]"; + } +}; + +static std::string + format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 1_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 2_n); + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + + auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim1_size), + " ", + [&](nonnegative_int dim1_idx) -> std::string { + return fmt::to_string( + accessor.at
(FFOrdered{dim0_idx, dim1_idx})); + }) + + "]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + << "\n]"; + } +}; + +static std::string + format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 2_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print3DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 3_n); + + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + + auto render_1d = [&](nonnegative_int dim0_idx, + nonnegative_int dim1_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim2_size), + " ", + [&](nonnegative_int dim2_idx) -> std::string { + return fmt::to_string(accessor.at
( + FFOrdered{dim0_idx, dim1_idx, dim2_idx})); + }) + + "]"; + }; + + auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { + return "[\n" + + indent(join_strings(nonnegative_range(dim1_size), + "\n", + [&](nonnegative_int dim1_idx) -> std::string { + return render_1d(dim0_idx, dim1_idx); + })) + + "\n]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_2d)) + << "\n]"; + } +}; + +static std::string + format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 3_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +static std::string + format_1d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_1d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_2d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_2d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +static std::string + format_3d_accessor_w_contents(GenericTensorAccessorW const &accessor) { + return format_3d_accessor_r_contents( + read_only_accessor_from_write_accessor(accessor)); +} + +std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_r_contents(accessor); + case 2: + return format_2d_accessor_r_contents(accessor); + case 3: + return format_3d_accessor_r_contents(accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) 
{ + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = + copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator); + + int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative(); + switch (num_dims) { + case 1: + return format_1d_accessor_w_contents(cpu_accessor); + case 2: + return format_2d_accessor_w_contents(cpu_accessor); + case 3: + return format_3d_accessor_w_contents(cpu_accessor); + default: + PANIC("Unhandled accessor dimensionality", num_dims); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc similarity index 78% rename from lib/kernels/src/legion_dim.cc rename to lib/kernels/src/kernels/legion_dim.cc index bbb15c5636..f3482b1d9b 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/kernels/legion_dim.cc @@ -1,7 +1,11 @@ #include "kernels/legion_dim.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { +using T = value_type<0>; +template std::set key_range(LegionOrdered const &); + legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { return legion_dim_t{ nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}}; @@ -11,6 +15,7 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, nonnegative_int num_dimensions) { return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - ff_dim.value.unwrap_nonnegative() - 1}}; + ; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..8af44173b0 --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,10 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct LegionOrdered; + +} // namespace FlexFlow diff --git 
a/lib/kernels/src/kernels/legion_ordered/slice.cc b/lib/kernels/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..69fcf570aa --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template LegionOrdered slice(LegionOrdered const &, + legion_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/transform.cc b/lib/kernels/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..d9fb38198e --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template LegionOrdered transform(LegionOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/kernels/src/kernels/local_cpu_allocator.cc similarity index 52% rename from lib/local-execution/src/local_cpu_allocator.cc rename to lib/kernels/src/kernels/local_cpu_allocator.cc index 4ca5f987a8..738d1abf27 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/kernels/src/kernels/local_cpu_allocator.cc @@ -1,20 +1,27 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" #include "utils/containers/contains_key.h" +#include +#include namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { void *ptr = malloc(requested_memory_size); + ASSERT(ptr != nullptr); this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); return ptr; } void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw 
std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains_key(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + free(ptr); + this->ptrs.erase(ptr); +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; } Allocator create_local_cpu_memory_allocator() { diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/kernels/local_cuda_allocator.cc similarity index 59% rename from lib/kernels/src/local_cuda_allocator.cc rename to lib/kernels/src/kernels/local_cuda_allocator.cc index cdcfb017a0..1b081517bf 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/kernels/local_cuda_allocator.cc @@ -1,6 +1,7 @@ #include "kernels/local_cuda_allocator.h" #include "kernels/device.h" #include "utils/containers/contains.h" +#include namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { @@ -11,13 +12,15 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { } void LocalCudaAllocator::deallocate(void *ptr) { - if (contains(this->ptrs, ptr)) { - checkCUDA(cudaFree(ptr)); - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + checkCUDA(cudaFree(ptr)); + this->ptrs.erase(ptr); +} + +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; } LocalCudaAllocator::~LocalCudaAllocator() { @@ -27,7 +30,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc new file mode 
100644 index 0000000000..c647181872 --- /dev/null +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -0,0 +1,30 @@ +#include "kernels/reverse_kernels_params.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs) { + auto axis = attrs.axis; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + if (i < axis.value) { + in_blk_size *= output_shape.at(ff_dim_t{i}); + } else if (i == axis.value) { + reverse_dim_size = output_shape.at(ff_dim_t{i}); + } else { + num_out_blks *= output_shape.at(ff_dim_t{i}); + } + } + + return ReverseKernelsParams{ + num_out_blks, + reverse_dim_size, + in_blk_size, + output_shape.get_volume(), + }; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete 
this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ea26d2350c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,16 +1,17 @@ #include "kernels/managed_per_device_ff_handle.h" -#include "device.h" +#include "internal/device.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + +void 
ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 00da2d0d70..066cb96753 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) set(FF_TEST_EXEC_NAME "kernels-tests") diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..8630dcd8cd --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,57 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Replicate::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + + GenericTensorAccessorR correct = input; + + Kernels::Replicate::cpu_forward_kernel(input, result); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + TEST_CASE("Replicate::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 
+ 3, 1 + 3 + 5}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + Kernels::Replicate::cpu_backward_kernel(output, result, 3); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } +} diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..db0016cb0b --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,206 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Reverse::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + 
}, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } + + TEST_CASE("Reverse::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + 
SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } +} diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc new file mode 100644 index 0000000000..0f34a6aa06 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -0,0 +1,392 @@ +#include "internal/test_utils.h" +#include "op-attrs/tensor_shape.h" +#include "utils/containers/require_all_same1.h" +#include "utils/join_strings.h" +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_zero_filled_accessor_w(shape, allocator); + return read_only_accessor_from_write_accessor(accessor); +} + +GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator) { + nonnegative_int ncols = num_elements(contents); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{col_idx}) = + contents.at(col_idx.unwrap_nonnegative()); + } + + GenericTensorAccessorW result = 
allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator) { + nonnegative_int nrows = num_elements(contents); + ASSERT(nrows > 0); + + nonnegative_int ncols = throw_if_unexpected( + require_all_same1(transform(contents, [](std::vector const &row) { + return num_elements(row); + }))); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{nrows, ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int row_idx : nonnegative_range(nrows)) { + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{row_idx, col_idx}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return num_elements(m); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( + transform(contents, [](std::vector> const &m) { + return throw_if_unexpected( + require_all_same1(transform(m, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, + DataType::FLOAT, + }; + 
+ Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + cpu_accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()); + } + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + nonnegative_int dim0_size = num_elements(contents); + ASSERT(dim0_size > 0); + + nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return num_elements(t); + }))); + ASSERT(dim1_size > 0); + + nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &m) { + return throw_if_unexpected(require_all_same1( + transform(m, [](std::vector> const &vec) { + return num_elements(vec); + }))); + }))); + ASSERT(dim2_size > 0); + + nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( + contents, [](std::vector>> const &t) { + return throw_if_unexpected(require_all_same1( + transform(t, [](std::vector> const &mat) { + return throw_if_unexpected(require_all_same1( + transform(mat, [](std::vector const &vec) { + return num_elements(vec); + }))); + }))); + }))); + ASSERT(dim3_size > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, + DataType::FLOAT, + }; + + GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); + + for 
(nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { + accessor.at( + FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = + contents.at(dim0_idx.unwrap_nonnegative()) + .at(dim1_idx.unwrap_nonnegative()) + .at(dim2_idx.unwrap_nonnegative()) + .at(dim3_idx.unwrap_nonnegative()); + } + } + } + } + + return accessor; +} + +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_1d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_2d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_3d_accessor_w_with_contents(contents, allocator)); +} + +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator) { + return read_only_accessor_from_write_accessor( + create_4d_accessor_w_with_contents(contents, allocator)); +} + +template +struct CreateRandomFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + using T = real_type_t
; + T *data_ptr = src_accessor.get
(); + + std::random_device rd; + std::mt19937 gen(rd()); + size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); + if constexpr (std::is_same::value) { + std::bernoulli_distribution dist(0.5); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_floating_point::value) { + std::uniform_real_distribution dist(-1.0, 1.0); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } else if constexpr (std::is_integral::value) { + std::uniform_int_distribution dist(0, 99); + for (size_t i = 0; i < num_elements; i++) { + data_ptr[i] = dist(gen); + } + } + + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + return DataTypeDispatch1{}( + shape.data_type, shape, allocator); +} + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_random_filled_accessor_w(shape, allocator); + + return read_only_accessor_from_write_accessor(accessor); +} + +template +struct FillWithZeros { + void operator()(GenericTensorAccessorW const &accessor) { + using T = real_type_t
; + + if (accessor.device_type == DeviceType::CPU) { + memset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); + } else { + checkCUDA(cudaMemset(accessor.ptr, + 0, + accessor.shape.get_volume().unwrap_nonnegative() * + sizeof(T))); + } + } +}; + +void fill_with_zeros(GenericTensorAccessorW const &accessor) { + DataTypeDispatch1{}(accessor.data_type, accessor); +} + +template +struct CPUAccessorRContainsNonZero { + bool operator()(GenericTensorAccessorR const &accessor) { + using T = real_type_t
; + + T const *data_ptr = accessor.get
(); + + int volume = accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (data_ptr[i] != 0) { + return true; + } + } + + return false; + } +}; + +bool contains_non_zero(GenericTensorAccessorR const &accessor) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); + return DataTypeDispatch1{}( + cpu_accessor.data_type, cpu_accessor); +} + +template +struct AccessorsAreEqual { + bool operator()(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR cpu_accessor_a = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); + GenericTensorAccessorR cpu_accessor_b = + copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); + + using T = real_type_t
; + T const *a_data_ptr = cpu_accessor_a.get
(); + T const *b_data_ptr = cpu_accessor_b.get
(); + + int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + if (a_data_ptr[i] != b_data_ptr[i]) { + return false; + } + } + + return true; + } +}; + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b) { + ASSERT(accessor_a.shape == accessor_b.shape, + "accessors_are_equal expects accessors to have the same shape"); + + return DataTypeDispatch1{}( + accessor_a.data_type, accessor_a, accessor_b); +} + +template +struct CreateFilledAccessorW { + GenericTensorAccessorW operator()(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + using T = real_type_t
; + if (!val.template has()) { + throw mk_runtime_error("create_filed_accessor expected data type of " + "shape and passed-in value to match"); + } + + auto unwrapped_value = val.get(); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape); + + T *data_ptr = src_accessor.get
(); + + int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + for (size_t i = 0; i < volume; i++) { + data_ptr[i] = unwrapped_value; + } + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + return dst_accessor; + } +}; + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + + return DataTypeDispatch1{}( + shape.data_type, shape, allocator, val); +} + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val) { + GenericTensorAccessorW w_accessor = + create_filled_accessor_w(shape, allocator, val); + return read_only_accessor_from_write_accessor(w_accessor); +} +} // namespace FlexFlow diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h new file mode 100644 index 0000000000..a4fc9b88c8 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.h @@ -0,0 +1,78 @@ +#ifndef _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H +#define _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H + +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/device.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/datatype.h" +#include "op-attrs/datatype_value.dtg.h" +#include +#include +#include +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator); + +GenericTensorAccessorW + 
create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator); +GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator); +GenericTensorAccessorR create_2d_accessor_r_with_contents( + std::vector> const &contents, Allocator &allocator); + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &contents, + Allocator &allocator); + +GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &contents, + Allocator &allocator); +GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &contents, + Allocator &allocator); + +bool contains_non_zero(GenericTensorAccessorR const &accessor); + +void fill_with_zeros(GenericTensorAccessorW const &accessor); + +void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, + std::ostream &stream); + +bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, + GenericTensorAccessorR const &accessor_b); + +GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape, + Allocator &allocator, + DataTypeValue val); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc new file mode 100644 index 0000000000..98f8471212 --- /dev/null +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -0,0 +1,73 @@ +#include "kernels/accessor.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("calculate_accessor_offset") { + 
SUBCASE("one dimension") { + std::vector indices = {4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 13_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("multiple dimensions") { + std::vector indices = {2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 2_n * 5_n + 4_n; + + CHECK(result == correct); + } + + SUBCASE("zero dimensions") { + std::vector indices = {}; + ArrayShape shape = ArrayShape{std::vector{}}; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("index and shape dimensions do not match") { + std::vector indices = {1_n, 2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + + SUBCASE("out of bounds index") { + std::vector indices = {2_n, 5_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + } +} diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc new file mode 100644 index 0000000000..1fb4c0b541 --- /dev/null +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -0,0 +1,49 @@ +#include "kernels/array_shape.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_array_coord_set") { + SUBCASE("ArrayShape is not empty") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 1_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{1_n, 
0_n, 0_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape has a dimension of size zero") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 0_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = {}; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..915a84c335 --- /dev/null +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -0,0 +1,94 @@ +#include "kernels/format_accessor_contents.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("format_accessor_r_contents(GenericTensorAccessorR)") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("accessor is 1d") { + GenericTensorAccessorR accessor = + create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + + std::string correct = "[1 2 3 2]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 2d") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); + + std::string correct = "[\n" + " [1 2 3 5]\n" + " [4 3 3 2]\n" + " [1 1 5 8]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 3d") { + GenericTensorAccessorR accessor = 
create_3d_accessor_r_with_contents( + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + }, + cpu_allocator); + + std::string correct = "[\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is some other dimension") { + GenericTensorAccessorR accessor = + create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + + CHECK_THROWS(format_accessor_r_contents(accessor)); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc new file mode 100644 index 0000000000..34822ed1c3 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -0,0 +1,32 @@ +#include "kernels/legion_dim.h" +#include "test/utils/doctest/fmt/set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("key_range(LegionOrdered)") { + SUBCASE("input is non-empty") { + LegionOrdered input = {5, 3, 2, 3}; + + std::set result = key_range(input); + std::set correct = { + legion_dim_t{0_n}, + legion_dim_t{1_n}, + legion_dim_t{2_n}, + legion_dim_t{3_n}, + }; + + CHECK(result == correct); + } + + SUBCASE("input is empty") { + LegionOrdered input = {}; + + std::set result = key_range(input); + std::set correct = {}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..4b50cad735 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include 
"test/utils/rapidcheck.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE_TEMPLATE( + "Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](LegionOrdered) {}); + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/slice.cc b/lib/kernels/test/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..d0211d270e --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,30 @@ +#include "kernels/legion_ordered/slice.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("slice(LegionOrdered, ..., ...") { + LegionOrdered d = LegionOrdered{ + 1, + 2, + 3, + 4, + }; + SUBCASE("legion_dim_t, legion_dim_t") { + LegionOrdered result = slice(d, + legion_dim_t{nonnegative_int{1}}, + legion_dim_t{nonnegative_int{3}}); + LegionOrdered correct = LegionOrdered{2, 3}; + + CHECK(result == correct); + } + SUBCASE("legion_dim_t, std::nullopt_t") { + LegionOrdered result = + slice(d, legion_dim_t{nonnegative_int{1}}, std::nullopt); + LegionOrdered correct = LegionOrdered{2, 3, 4}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..759507264f --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,36 @@ +#include "kernels/legion_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("transform(LegionOrdered, F)") { + SUBCASE("input is empty") { + LegionOrdered input = {}; + + LegionOrdered result = + transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + LegionOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + LegionOrdered input = {2, 1, 2, 5}; + + LegionOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + 
LegionOrdered correct = LegionOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..9064ae4824 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/attention_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { nonnegative_int num_samples = 10_n; nonnegative_int num_heads = 4_n; @@ -19,7 +19,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -39,16 +41,26 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( - {nonnegative_int{state.weightSize}}); + TensorShape query_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, qSize}}, + DataType::FLOAT, + }; + TensorShape key_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, kSize}}, + DataType::FLOAT, + }; + TensorShape value_shape 
= TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, vSize}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, oProjSize}}, + DataType::FLOAT, + }; + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + DataType::FLOAT, + }; GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -72,9 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..5f63b48198 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_matmul_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { nonnegative_int m = 10_n; nonnegative_int n = 10_n; @@ -15,16 +15,24 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); - TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + 
TensorShape input_shape_a = TensorShape{ + TensorDims{FFOrdered{batch, k, m}}, + DataType::FLOAT, + }; + TensorShape input_shape_b = TensorShape{ + TensorDims{FFOrdered{batch, n, k}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{batch, n, m}}, + DataType::FLOAT, + }; GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..903ad8cc43 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,10 +1,11 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { nonnegative_int output_n = 1_n; nonnegative_int output_c = 10_n; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -26,25 +29,33 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape scale_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape bias_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel( /*stream=*/managed_stream.raw_stream(), @@ -54,10 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*scale_ptr=*/scale_accessor.get_float_ptr(), /*bias_ptr=*/bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -73,9 +81,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, - /*input_ptr=*/input_accessor.get_float_ptr(), - /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + 
/*input_ptr=*/input_accessor.get_float_ptr(), /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), /*scale_ptr=*/scale_accessor.get_float_ptr(), /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), @@ -83,19 +91,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*numElements=*/ input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0e0769014d..0c41fe12ac 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,56 +1,86 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/cast_kernels.h" -#include "test_utils.h" -#include +#include "kernels/cast_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Cast Forward and Backward Kernels") { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); - TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100_n, 100_n}); - - GenericTensorAccessorW output_accessor = - 
create_random_filled_accessor_w(output_shape, allocator); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::DOUBLE, + }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = 
create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::DOUBLE, + }; + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..2040dcbd5d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,39 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/combine_kernels.h" -#include "test_utils.h" +#include "kernels/combine_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; 
Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +41,66 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n, 5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine 
Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..c2df907917 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,56 +1,113 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/concat_kernels.h" -#include "test_utils.h" #include 
"utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 3_n; - nonnegative_int size_per_input = 100_n; - ff_dim_t concat_axis = ff_dim_t{0_n}; - - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + 
CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape 
output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index ed5852bc31..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" -#include "test_utils.h" +#include "internal/test_utils.h" +#include #include namespace FlexFlow { -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test CUDA") { int deviceCount = 0; diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..409b06d9a9 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,38 +1,37 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/dropout_kernels.h" -#include "test_utils.h" #include "utils/containers/count.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Dropout Kernels") { unsigned long long seed = 12345; float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_n, 10_n}, }; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle 
managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..f8a3abdb98 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,21 +1,27 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/flat_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + 
TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +31,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git 
a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..f0be809475 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,61 +1,107 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/gather_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW output_accessor = - 
allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + 
create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..02a95ba58a 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,23 +1,30 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/layer_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { nonnegative_int batch_size = 10_n; nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, feature_size}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + TensorShape feature_shape = TensorShape{ + 
TensorDims{FFOrdered{feature_size}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..fb5920adcc --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,107 @@ +#include "internal/test_utils.h" +#include "kernels/gather_kernels.h" +#include + +using namespace 
::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape const &input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input=*/input_accessor, + /*index=*/index_accessor, + /*output=*/output_accessor); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + } + + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape const &input_shape, + 
TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_grad=*/output_grad_accessor, + /*index=*/index_accessor, + /*input_grad=*/input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..fc67764cdb --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,37 @@ +#include "kernels/managed_per_device_ff_handle.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + CHECK(&new_handle.raw_handle() == 
base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + new_handle = std::move(base_handle); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..5452266dad 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/partition_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -14,48 +17,36 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( managed_handle.raw_handle(), DataType::FLOAT); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..f2ada8387e 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,9 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/pool_2d_kernels.h" -#include "test_utils.h" +#include 
"op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { nonnegative_int input_w = 10_n; nonnegative_int input_h = 10_n; @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -46,10 +49,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, input_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -62,28 +69,23 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.ptr, output_accessor.ptr); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); 
Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), state, - input_accessor.ptr, - input_grad_accessor.ptr, output_accessor.ptr, - output_grad_accessor.ptr); + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..e13b149769 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,27 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reduction_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {10_n, 10_n, 10_n, 10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({10_n}); + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); 
GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -30,30 +36,22 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor, num_replicas); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { TensorShape output_shape = input_shape; - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor); - - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_grad_data = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_grad_data == expected_grad_input_data); + output_grad_accessor, + input_grad_accessor); + + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..83a9a992f7 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,55 +1,150 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels.h" -#include "test_utils.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test Replicate Kernel") { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + 
TEST_CASE("Call Replicate Forward and Backward Kernels") { nonnegative_int num_replicas = 10_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = input_shape; + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); + + GenericTensorAccessorW output = + gpu_allocator.allocate_tensor(output_shape); Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor, output_accessor); + managed_stream.raw_stream(), input, output); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + GenericTensorAccessorR correct = input; - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK_MESSAGE(accessors_are_equal(output, correct), + check_kv("output", format_accessor_w_contents(output))); } SUBCASE("backward_kernel") { - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); - GenericTensorAccessorR 
output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + gpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + + GenericTensorAccessorW input_grad = + gpu_allocator.allocate_tensor(input_shape); Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor, - output_grad_accessor, + output_grad, + input_grad, num_replicas.unwrap_nonnegative()); - std::vector check_aggregated_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(check_aggregated_data)); + CHECK_MESSAGE( + accessors_are_equal(input_grad, correct), + check_kv("input_grad", format_accessor_w_contents(input_grad))); + } + } + + TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { + nonnegative_int num_replicas = 2_n; + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{5_n, num_replicas}}, + DataType::FLOAT, + }; + + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("forward_kernel") { + // Run GPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Replicate::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // 
Run CPU Replicate Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK_MESSAGE( + accessors_are_equal(output_accessor_gpu, output_accessor_cpu), + check_kv("input", format_accessor_r_contents(input_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(output_accessor_cpu))); + } + + SUBCASE("backward_kernel") { + // Run GPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + num_replicas.unwrap_nonnegative()); + + // Run CPU Replicate Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Replicate::cpu_backward_kernel( + output_grad_accessor_cpu, + input_grad_accessor_cpu, + num_replicas.unwrap_nonnegative()); + + CHECK_MESSAGE( + accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), + check_kv("output_grad", + format_accessor_r_contents(output_grad_accessor_cpu)), + check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)), + check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu))); } } } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..66c6bf849b 100644 --- 
a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,16 +1,21 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reshape_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +23,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - 
create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..6e12c48ac3 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,63 +1,124 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels.h" -#include "test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - nonnegative_int reverse_dim_size = 10_n; - nonnegative_int in_blk_size = 10_n; - nonnegative_int num_out_blks = 1_n; - - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{1_n, 10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - 
read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reverse::forward_kernel( - managed_stream.raw_stream(), - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(check_output_data)); + managed_stream.raw_stream(), input_accessor, output_accessor, attrs); + + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Reverse::backward_kernel( - managed_stream.raw_stream(), - output_grad_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad_accessor.shape.num_elements().unwrap_nonnegative()); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + allocator.allocate_tensor(input_shape); + + Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor, + attrs); + + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { + TensorShape input_shape = 
TensorShape{
+        TensorDims{FFOrdered{1_n, 4_n, 3_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = input_shape;
+
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
+    ManagedFFStream managed_stream{};
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    ReverseAttrs attrs = ReverseAttrs{
+        /*axis=*/ff_dim_t{0_n},
+    };
+
+    SUBCASE("forward_kernel") {
+      // Run GPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r(input_shape, gpu_allocator);
+      GenericTensorAccessorW output_accessor_gpu =
+          create_zero_filled_accessor_w(output_shape, gpu_allocator);
+
+      Kernels::Reverse::forward_kernel(managed_stream.raw_stream(),
+                                       input_accessor_gpu,
+                                       output_accessor_gpu,
+                                       attrs);
+
+      // Run CPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW output_accessor_cpu =
+          create_zero_filled_accessor_w(output_shape, cpu_allocator);
+
+      Kernels::Reverse::cpu_forward_kernel(
+          input_accessor_cpu, output_accessor_cpu, attrs);
+
+      CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu));
+    }
+
+    SUBCASE("backward_kernel") {
+      // Run GPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r(output_shape, gpu_allocator);
+
+      GenericTensorAccessorW input_grad_accessor_gpu =
+          create_zero_filled_accessor_w(input_shape, gpu_allocator);
+
+      Kernels::Reverse::backward_kernel(managed_stream.raw_stream(),
+                                        output_grad_accessor_gpu,
+                                        input_grad_accessor_gpu,
+                                        attrs);
+
+      // Run CPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_cpu =
+          create_zero_filled_accessor_w(input_shape, cpu_allocator); 
+ + Kernels::Reverse::cpu_backward_kernel( + output_grad_accessor_cpu, input_grad_accessor_cpu, attrs); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..904cca2d3e 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/softmax_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { nonnegative_int input_n = 1_n; nonnegative_int input_c = 1_n; @@ -12,12 +12,17 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = @@ -40,30 +45,22 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); 
GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); Kernels::Softmax::backward_kernel( managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr(), output_grad_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector expected_input_grad_data = std::vector( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..44e8f42f76 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,24 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/split_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + 
TensorDims{FFOrdered{50_n}}, + DataType::FLOAT, + }; SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -47,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..3c15661396 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,58 +1,54 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/transpose_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { TransposeAttrs attrs = TransposeAttrs{ - FFOrdered{ - ff_dim_t{0_n}, + FFOrdered{ ff_dim_t{1_n}, + ff_dim_t{0_n}, }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = 
allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), attrs, input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc deleted file mode 100644 index 903b666fa9..0000000000 --- a/lib/kernels/test/src/test_utils.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "test_utils.h" - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * 
sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } -} - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; -} - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; 
-} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h deleted file mode 100644 index 08f0f382fb..0000000000 --- a/lib/kernels/test/src/test_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef _FLEXFLOW_KERNELS_TEST_UTILS -#define _FLEXFLOW_KERNELS_TEST_UTILS - -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include -#include -#include -#include -#include - -using namespace FlexFlow; - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims); - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims); - -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); - - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} - -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} - -// Specialize doctest's StringMaker for std::vector -template <> -struct doctest::StringMaker> { - static doctest::String convert(std::vector const &vec) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << ", "; - } - } - return doctest::String(("[" + oss.str() + 
"]").c_str()); - } -}; - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_task_argument_accessor.cc 
b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 1df6da8d8e..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -134,9 +134,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + 
output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index bb1504a3f5..55ff354483 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -107,8 +107,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -120,10 +120,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index c5ff9199f3..311b8e7924 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -58,8 +58,10 @@ static DeviceSpecificDeviceStates ParallelTensorShape output_shape = 
throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = init_kernel( - get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); + ElementUnaryPerDeviceState per_device_state = + init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), + array_shape_from_tensor_shape(get_piece_shape(output_shape)), + attrs); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -88,10 +90,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -106,10 +108,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 0f872b5d50..af6fc16272 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -40,15 +40,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc 
b/lib/local-execution/src/ops/linear.cc index 6f0901e66a..9641cdbd4a 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -26,9 +26,9 @@ OpTaskInvocation init(LinearAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle()); binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {task_id_t::LINEAR_INIT_TASK_ID, binding}; } @@ -36,11 +36,11 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); if (attrs.use_bias) { - binding.bind(BIAS, weight_tensor(1)); // bias + binding.bind(BIAS, weight_tensor(1)); } binding.bind_arg(PROFILING, profiling_settings()); @@ -124,20 +124,21 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_grad_ptr = NULL; if (attrs.use_bias) { - bias_ptr = bias.get_float_ptr(); + auto bias_grad = 
acc.get_tensor_grad(BIAS); + bias_grad_ptr = bias_grad.get_float_ptr(); } nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); @@ -148,13 +149,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_grad_ptr, in_dim.unwrap_nonnegative(), out_dim.unwrap_nonnegative(), batch_size.unwrap_nonnegative()); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index fb0635efba..f85874dc0a 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -115,19 +115,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index ee1a7c6c4e..b07d9fe965 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -63,13 +63,13 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { 
ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reduction_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc index 6c0c813c8d..7b6e9fe2f6 100644 --- a/lib/local-execution/src/ops/repartition.cc +++ b/lib/local-execution/src/ops/repartition.cc @@ -85,8 +85,8 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(INPUT); + auto input_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index d3ada35d93..99aeb913ba 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -66,8 +66,8 @@ static std::optional return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", - input_grad, output_grad, + input_grad, attrs.replicate_degree.unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc index fc3a75607d..e382b2668e 100644 --- a/lib/local-execution/src/ops/reshape.cc +++ b/lib/local-execution/src/ops/reshape.cc @@ -86,8 +86,8 @@ static std::optional profiling, "[Reshape] backward time = {:.2lf}ms\n", per_device_state, - input_grad, - output_grad); + output_grad, + input_grad); } TaskImplFunction get_reshape_init_task_impl() { diff --git a/lib/local-execution/src/ops/reverse.cc 
b/lib/local-execution/src/ops/reverse.cc index ddd47d355d..00f56c6892 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -48,30 +48,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - nonnegative_int output_size = output.shape.get_volume(); - auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { - if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{i}); - } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= output.shape.at(ff_dim_t{i}); - } - } - return profile(forward_kernel, profiling, "[reverse] forward_time = {:.2lf}ms\n", - input.get_float_ptr(), - output.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - output_size.unwrap_nonnegative()); + input, + output, + attrs); } static std::optional @@ -81,30 +63,12 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.num_dims().unwrap_nonnegative() - - attrs.axis.value.unwrap_nonnegative() - 1; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - return profile(backward_kernel, profiling, "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - 
num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); + output_grad, + input_grad, + attrs); } TaskImplFunction get_reverse_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 0e94422c5f..e008098e05 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -106,8 +106,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume().unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 4146836b9a..1859bb0ccc 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -67,8 +67,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", attrs, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return 
Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,68 +12,71 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// 
/*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// 
std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index dffb19398c..e55d1eddf5 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,6 +1,6 @@ #include "kernels/attention_kernels.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" @@ -106,24 +106,24 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(query_shape), dtype}; CHECK(result == correct); } SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(key_shape), dtype}; CHECK(result == correct); } SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair 
correct = {ArrayShape{value_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(value_shape), dtype}; CHECK(result == correct); } } @@ -135,9 +135,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } @@ -146,9 +146,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 0fab0f6a60..a39bb229e2 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,5 +1,5 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "utils/fmt/variant.h" diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,6 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ 
b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H + +#include "op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value); +DataTypeValue make_double_data_type_value(double value); +DataTypeValue make_int32_data_type_value(int32_t value); +DataTypeValue make_int64_data_type_value(int64_t value); +DataTypeValue make_bool_data_type_value(bool value); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index f2355289dc..5c47745209 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -17,13 +17,9 @@ struct DimOrdered { DimOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} - /* template ::value>::type> */ DimOrdered(std::vector const &contents) : contents(contents.begin(), contents.end()) {} - /* template ::value>::type> */ template DimOrdered(It begin, It end) : contents(begin, end) {} @@ -62,10 +58,6 @@ struct DimOrdered { return this->contents != other.contents; } - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - using iterator = typename stack_vector::iterator; using const_iterator = typename stack_vector::const_iterator; @@ -116,7 +108,7 @@ struct DimOrdered { } reverse_iterator rend() { - return this->contents.crend(); + return this->contents.rend(); } const_reverse_iterator rend() const { @@ -145,195 +137,26 @@ struct DimOrdered { stack_vector contents; }; -template -struct DimOrdered { - DimOrdered() {} - - DimOrdered(std::initializer_list const &l) - : contents(l.begin(), l.end()) {} - - DimOrdered(std::vector const &contents) - : 
contents(contents.begin(), contents.end()) {} - - template - DimOrdered(It begin, It end) : contents(begin, end) {} - - template - DimOrdered(stack_vector const &contents) - : contents(contents.begin(), contents.end()) {} - - T const &at(ff_dim_t idx) const { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T const &at(relative_ff_dim_t idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T &at(ff_dim_t idx) { - int raw = idx.value.unwrap_nonnegative(); - return this->contents.at(raw); - } - - T &at(relative_ff_dim_t idx) { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); - } - - T const &operator[](ff_dim_t idx) const { - return this->at(idx); - } - - T const &operator[](relative_ff_dim_t idx) const { - return this->at(idx); - } - - T &operator[](ff_dim_t idx) { - return this->at(idx); - } - - T &operator[](relative_ff_dim_t idx) { - return this->at(idx); - } - - bool idx_is_valid(ff_dim_t const &idx) const { - int raw = idx.value.unwrap_nonnegative(); - return raw < this->contents.size(); - } - - bool idx_is_valid(relative_ff_dim_t const &idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return (raw >= 0 && raw < this->contents.size()); - } - - bool operator==(DimOrdered const &other) const { - return this->contents == other.contents; - } - - bool operator!=(DimOrdered const &other) const { - return this->contents != other.contents; - } - - bool operator<(DimOrdered const &other) const { - return this->contents < other.contents; - } - - using iterator = typename stack_vector::iterator; - using const_iterator = - typename stack_vector::const_iterator; - using reverse_iterator = - typename stack_vector::reverse_iterator; - using const_reverse_iterator = - typename stack_vector::const_reverse_iterator; - using value_type = T; - using pointer 
= value_type *; - using const_pointer = value_type const *; - using reference = value_type &; - using const_reference = value_type const &; - - iterator begin() { - return this->contents.begin(); - } - - const_iterator begin() const { - return this->cbegin(); - } - - const_iterator cbegin() const { - return this->contents.cbegin(); - } - - iterator end() { - return this->contents.end(); - } - - const_iterator end() const { - return this->cend(); - } - - const_iterator cend() const { - return this->contents.cend(); - } - - reverse_iterator rbegin() { - return this->contents.rbegin(); - } - - const_reverse_iterator rbegin() const { - return this->crbegin(); - } - - const_reverse_iterator crbegin() const { - return this->contents.crbegin(); - } - - reverse_iterator rend() { - return this->contents.crend(); - } - - const_reverse_iterator rend() const { - return this->crend(); - } - - const_reverse_iterator crend() const { - return this->contents.crend(); - } - - size_t size() const { - return this->contents.size(); - } - - size_t empty() const { - return this->contents.empty(); - } - - size_t num_dims() const { - return this->size(); - } - - friend struct ::std::hash; - -private: - stack_vector contents; -}; - -template -using FFOrdered = DimOrdered; +template +auto operator<(DimOrdered const &lhs, DimOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} -template -std::string format_as(FFOrdered const &v) { +template +std::string format_as(DimOrdered const &v) { std::vector as_vec(v.cbegin(), v.cend()); return fmt::format("", as_vec); } -template -std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { +template +std::ostream &operator<<(std::ostream &s, DimOrdered const &v) { return (s << fmt::to_string(v)); } } // namespace FlexFlow -/* template */ -/* void to_json(json &j, DimOrdered const &x) { */ -/* /1* j = std::vector{x.cbegin(), x.cend()}; *1/ */ -/* } */ - 
-/* template */ -/* void from_json(json const &j, DimOrdered &x) { */ -/* /1* x = DimOrdered{j.template get>()}; *1/ */ -/* } */ - namespace nlohmann { template struct adl_serializer<::FlexFlow::DimOrdered> { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index 166916dd44..76526447be 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H #include "op-attrs/dim_ordered/dim_ordered.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h" #include "utils/optional.h" @@ -18,35 +18,8 @@ DimOrdered nonoverloaded_slice(DimOrdered const &d, }; return DimOrdered{ - subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; + slice(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; } - -template -FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform( - idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - -template -FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end) { - auto to_raw_idx = - [](std::optional const &idx) -> std::optional { - return transform(idx, [](relative_ff_dim_t const &i) { return i.value; }); - }; - - return FFOrdered{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; -} - template DimOrdered slice(DimOrdered const &d, std::optional const &start = std::nullopt, @@ -54,20 +27,6 @@ DimOrdered slice(DimOrdered const &d, return ff_dim_t_nonoverloaded_slice(d, start, end); } -template -FFOrdered 
slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return ff_dim_t_nonoverloaded_slice(d, start, end); -} - -template -FFOrdered slice(FFOrdered const &d, - std::optional const &start = std::nullopt, - std::optional const &end = std::nullopt) { - return relative_ff_dim_t_nonoverloaded_slice(d, start, end); -} - } // namespace FlexFlow #endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/concat.h rename to lib/op-attrs/include/op-attrs/ff_ordered/concat.h index 9b9eaf9b93..a5faed2b36 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/concat_vectors.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h similarity index 95% rename from lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h rename to lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h index 9e4271a1ff..bc8636615c 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/bidict/bidict.h" #include "utils/containers/count.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h 
b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h new file mode 100644 index 0000000000..92ed211c31 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h @@ -0,0 +1,228 @@ +#ifndef _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H +#define _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H + +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/relative_ff_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct FFOrdered { + FFOrdered() {} + + FFOrdered(std::initializer_list const &l) : contents(l.begin(), l.end()) {} + + FFOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + FFOrdered(It begin, It end) : contents(begin, end) {} + + template + FFOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(ff_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &at(relative_ff_dim_t idx) const { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T &at(ff_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(relative_ff_dim_t idx) { + int raw = idx.value; + if (raw < 0) { + raw = this->contents.size() + raw; + } + return this->contents.at(raw); + } + + T const &operator[](ff_dim_t idx) const { + return this->at(idx); + } + + T const &operator[](relative_ff_dim_t idx) const { + return this->at(idx); + } + + T &operator[](ff_dim_t idx) { + return this->at(idx); + } + + T &operator[](relative_ff_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(ff_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool idx_is_valid(relative_ff_dim_t const &idx) const { + int raw = idx.value; + if (raw < 0) { + raw = 
this->contents.size() + raw; + } + return (raw >= 0 && raw < this->contents.size()); + } + + bool operator==(FFOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(FFOrdered const &other) const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(FFOrdered const &lhs, FFOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), 
rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(FFOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, FFOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::FFOrdered> { + static ::FlexFlow::FFOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered const &x) { + j = std::vector{x.cbegin(), x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::FFOrdered> { + size_t operator()(::FlexFlow::FFOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::FFOrdered> { + static Gen<::FlexFlow::FFOrdered> arbitrary() { + return gen::construct<::FlexFlow::FFOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h index f8f49233ec..9232afddfb 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H -#include "op-attrs/dim_ordered/dim_ordered.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" #include "op-attrs/ff_dim_t.h" +#include 
"op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h similarity index 88% rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h index 8cc1bf3a51..ace60b7e3d 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h similarity index 91% rename from lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h rename to lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h index 4e7f8530a4..5ff390d3fe 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H -#include "op-attrs/dim_ordered/dim_ordered.h" #include "op-attrs/ff_dim_t.h" +#include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/count.h" #include "utils/containers/transform.h" diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h new file mode 100644 index 0000000000..79217c4cc3 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H 
+ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template +FFOrdered ff_dim_t_nonoverloaded_slice(FFOrdered const &d, + ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &d, + relative_ff_dim_t const &start, + std::optional const &end) { + int raw_start = start.value; + std::optional raw_end = + transform(end, [](relative_ff_dim_t const &i) { return i.value; }); + + return FFOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +template +FFOrdered slice(FFOrdered const &d, + ff_dim_t const &start = ff_dim_t{0_n}, + std::optional const &end = std::nullopt) { + return ff_dim_t_nonoverloaded_slice(d, start, end); +} + +template +FFOrdered slice(FFOrdered const &d, + relative_ff_dim_t const &start = relative_ff_dim_t{0}, + std::optional const &end = std::nullopt) { + return relative_ff_dim_t_nonoverloaded_slice(d, start, end); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h new file mode 100644 index 0000000000..3a8eeb9ecf --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +FFOrdered transform(FFOrdered const &d, F &&f) { + return 
FFOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h new file mode 100644 index 0000000000..fe207740f7 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/zip.h" + +namespace FlexFlow { + +template +FFOrdered> zip(FFOrdered const &lhs, + FFOrdered const &rhs) { + return FFOrdered>{zip(vector_of(lhs), vector_of(rhs))}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml index b1c5f60382..50756f095b 100644 --- a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", ] [[fields]] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index be3a95eec8..d68ef02ec1 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml 
b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml index f24fa12309..d2f8758377 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/replica_parallel_dim_set.dtg.h", "", diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index 97f3432c2f..ba35295e09 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -19,7 +19,7 @@ std::optional get_broadcast_target_dims(std::unordered_set const &); TensorDims slice_tensor_dims(TensorDims const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index e86b866fd6..8c6d1098cc 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "op-attrs/dim_ordered/dim_ordered.h", + "op-attrs/ff_ordered/ff_ordered.h", "utils/nonnegative_int/nonnegative_int.h", ] diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index a3cd8bfd9a..298ea04638 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -12,7 +12,7 @@ nonnegative_int get_num_elements(TensorShape const &); nonnegative_int get_size_in_bytes(TensorShape const &); TensorShape slice_tensor_shape(TensorShape const &, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc 
b/lib/op-attrs/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..4604ef0b4e --- /dev/null +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc @@ -0,0 +1,25 @@ +#include "op-attrs/datatype_value.h" + +namespace FlexFlow { + +DataTypeValue make_float_data_type_value(float value) { + return DataTypeValue{value}; +} + +DataTypeValue make_double_data_type_value(double value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int32_data_type_value(int32_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_int64_data_type_value(int64_t value) { + return DataTypeValue{value}; +} + +DataTypeValue make_bool_data_type_value(bool value) { + return DataTypeValue{value}; +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc deleted file mode 100644 index cb29f708a3..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/concat.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc deleted file mode 100644 index 6edd5485af..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/enumerate.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc deleted file mode 100644 index 2de88f38c8..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc deleted file mode 100644 index 8e5c2fd38a..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_of.h" diff --git 
a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc deleted file mode 100644 index 175ae8d4bd..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/get_idxs.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc index 75ab1a32aa..8c3dbd7bbc 100644 --- a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc @@ -1,26 +1 @@ #include "op-attrs/dim_ordered/slice.h" -#include "utils/archetypes/value_type.h" - -namespace FlexFlow { - -using T = value_type<0>; - -template FFOrdered - ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc new file mode 100644 index 0000000000..73683eba94 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc @@ -0,0 +1 @@ +#include "op-attrs/dim_ordered/transform.h" diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc new file mode 100644 index 0000000000..e06c144149 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(FFOrdered const &); + +} // namespace FlexFlow diff --git 
a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..1420586809 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct FFOrdered; + +template std::string format_as(FFOrdered const &); + +template std::ostream &operator<<(std::ostream &, FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc new file mode 100644 index 0000000000..e39fedb858 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -0,0 +1,13 @@ +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_ordered_from_map(std::map const &); + +template FFOrdered + ff_ordered_from_map(std::unordered_map const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc new file mode 100644 index 0000000000..3da15bebba --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector get_idxs(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc new file mode 100644 index 0000000000..059fd811cd --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc @@ -0,0 +1,24 @@ +#include "op-attrs/ff_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + 
+template FFOrdered ff_dim_t_nonoverloaded_slice( + FFOrdered const &, ff_dim_t const &, std::optional const &); + +template FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..74bf4895a3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template FFOrdered transform(FFOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..dc715ea97c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; + +template FFOrdered> zip(FFOrdered const &, + FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index d4763ef004..ddd92bd417 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/batch_norm.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include 
"op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index fc42241ef2..bf0ba553e4 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/concat.h" -#include "op-attrs/dim_ordered/enumerate.h" -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 4dc602646b..5b5b91a8e7 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -1,8 +1,10 @@ #include "op-attrs/ops/embedding.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" #include "utils/containers/product.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index 8ed12167b3..b4eeda76ab 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/flat.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 00c6bb5e9b..c9798368e2 
100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/layer_norm.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" -#include "op-attrs/dim_ordered/get_idxs.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index fb26113613..bee9d0cf4f 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -1,11 +1,12 @@ #include "op-attrs/ops/linear.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -101,7 +102,7 @@ tl::expected SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), - std::nullopt, + relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, @@ -126,8 +127,10 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( - ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; + DiscardCopyDegree discard_copy_degree = + DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input), + relative_ff_dim_t{0}, + 
relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 7a8f91e498..3f2245b2dc 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/parallel_tensor_dims.h" -#include "op-attrs/dim_ordered/transform.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.h" diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 8d0592eab7..760278297c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -1,6 +1,6 @@ #include "op-attrs/tensor_dims.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/zip.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/zip.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.dtg.h" #include "utils/containers/all_of.h" @@ -67,7 +67,7 @@ std::optional } TensorDims slice_tensor_dims(TensorDims const &dims, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorDims{ slice(dims.ff_ordered, start, stop), diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 04b18794f1..afc14af54c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -29,7 +29,7 @@ nonnegative_int get_size_in_bytes(TensorShape const &s) { } TensorShape slice_tensor_shape(TensorShape const &shape, - std::optional const &start, + relative_ff_dim_t const &start, std::optional const &stop) { return TensorShape{ 
slice_tensor_dims(shape.dims, start, stop), diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc new file mode 100644 index 0000000000..9b0e90b601 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc @@ -0,0 +1,68 @@ +#include "op-attrs/datatype_value.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("test make_data_type_value") { + SUBCASE("make_float_data_type_value") { + float value = 1.0f; + DataTypeValue data_type_value = make_float_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_double_data_type_value") { + double value = 2.71828; + DataTypeValue data_type_value = make_double_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int32_data_type_value") { + int32_t value = -42; + DataTypeValue data_type_value = make_int32_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_int64_data_type_value") { + int64_t value = 1LL << 40; + DataTypeValue data_type_value = make_int64_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + + SUBCASE("make_bool_data_type_value") { + bool value = true; + 
DataTypeValue data_type_value = make_bool_data_type_value(value); + + CHECK(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK_FALSE(data_type_value.has()); + CHECK(data_type_value.get() == value); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc index d7901a0c53..a5a261da25 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc +++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc @@ -10,8 +10,4 @@ TEST_SUITE(FF_TEST_SUITE) { "Arbitrary> with T=", T, int, double, char) { RC_SUBCASE([](DimOrdered) {}); } - - TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { - RC_SUBCASE([](FFOrdered) {}); - } } diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc similarity index 97% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc index 2ac641cfc2..d8e04124bc 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/concat.h" +#include "op-attrs/ff_ordered/concat.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc similarity index 92% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc index bf4c33d65a..e1a94e72c3 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "test/utils/doctest/fmt/map.h" #include diff --git 
a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..b0812ba9d6 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,11 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "test/utils/rapidcheck.h" +#include + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE_TEMPLATE("Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](FFOrdered) {}); + } +} diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc similarity index 96% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc index bba989920e..73036d5662 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include using namespace ::FlexFlow; diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc similarity index 79% rename from lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc rename to lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc index b2fddd058e..2f1dfecd65 100644 --- a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc @@ -1,4 +1,4 @@ -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/slice.h" #include using namespace ::FlexFlow; @@ -25,13 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, ff_dim_t") { - FFOrdered result = - slice(d, std::nullopt, ff_dim_t{nonnegative_int{3}}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } 
SUBCASE("relative_ff_dim_t, relative_ff_dim_t") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-1}); @@ -45,12 +38,6 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("std::nullopt_t, relative_ff_dim_t") { - FFOrdered result = slice(d, std::nullopt, relative_ff_dim_t{-1}); - FFOrdered correct = FFOrdered{1, 2, 3}; - - CHECK(result == correct); - } SUBCASE("start index = stop index") { FFOrdered result = slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{1}); @@ -86,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK_THROWS(slice(d, relative_ff_dim_t{10}, std::nullopt)); } SUBCASE("stop index out of bounds (too low)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{-10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{-10})); } SUBCASE("stop index out of bounds (too high)") { - CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{10})); + CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{10})); } } } diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..4bf189ec77 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,35 @@ +#include "op-attrs/ff_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("transform(FFOrdered, F)") { + SUBCASE("input is empty") { + FFOrdered input = {}; + + FFOrdered result = transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + FFOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + FFOrdered input = {2, 1, 2, 5}; + + FFOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + FFOrdered correct = FFOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc 
b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..19167cd0ff --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,38 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "test/utils/doctest/fmt/pair.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("zip(FFOrdered, FFOrdered)") { + FFOrdered lhs_input = {9, 9, 8, 9}; + FFOrdered rhs_input = {"m", "m", "k", "l", "m"}; + + SUBCASE("lhs is longer") { + FFOrdered> result = zip(lhs_input, rhs_input); + + FFOrdered> correct = { + {9, "m"}, + {9, "m"}, + {8, "k"}, + {9, "l"}, + }; + + CHECK(result == correct); + } + + SUBCASE("rhs is longer") { + FFOrdered> result = zip(rhs_input, lhs_input); + + FFOrdered> correct = { + {"m", 9}, + {"m", 9}, + {"k", 8}, + {"l", 9}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml new file mode 100644 index 0000000000..ebb2323203 --- /dev/null +++ b/lib/pcg/include/pcg/metric.enum.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "Metric" +features = [ + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[values]] +name = "ACCURACY" + +[[values]] +name = "CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "SPARSE_CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "MEAN_SQUARED_ERROR" + +[[values]] +name = "ROOT_MEAN_SQUARED_ERROR" + +[[values]] +name = "MEAN_ABSOLUTE_ERROR" diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h new file mode 100644 index 0000000000..343c2154dd --- /dev/null +++ b/lib/pcg/include/pcg/metric_attrs.h @@ -0,0 +1,28 @@ +#ifndef _FF_METRICS_H_ +#define _FF_METRICS_H_ + +#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "pcg/metric.dtg.h" +#include "utils/fmt.h" +#include + +namespace FlexFlow { + +class MetricsAttrs { +public: + MetricsAttrs() = delete; + MetricsAttrs(LossFunction, std::unordered_set const &); + +public: + LossFunction 
loss_type; + bool measure_accuracy; + bool measure_categorical_crossentropy; + bool measure_sparse_categorical_crossentropy; + bool measure_mean_squared_error; + bool measure_root_mean_squared_error; + bool measure_mean_absolute_error; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc new file mode 100644 index 0000000000..9a93e75350 --- /dev/null +++ b/lib/pcg/src/pcg/metric_attrs.cc @@ -0,0 +1,38 @@ +#include "pcg/metric_attrs.h" + +namespace FlexFlow { +MetricsAttrs::MetricsAttrs(LossFunction _loss_type, + std::unordered_set const &metrics) + : loss_type(_loss_type), measure_accuracy(false), + measure_categorical_crossentropy(false), + measure_sparse_categorical_crossentropy(false), + measure_mean_squared_error(false), measure_root_mean_squared_error(false), + measure_mean_absolute_error(false) { + for (Metric const &m : metrics) { + switch (m) { + case Metric::ACCURACY: + measure_accuracy = true; + continue; + case Metric::CATEGORICAL_CROSSENTROPY: + measure_categorical_crossentropy = true; + continue; + case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: + measure_sparse_categorical_crossentropy = true; + continue; + case Metric::MEAN_SQUARED_ERROR: + measure_mean_squared_error = true; + continue; + case Metric::ROOT_MEAN_SQUARED_ERROR: + measure_root_mean_squared_error = true; + continue; + case Metric::MEAN_ABSOLUTE_ERROR: + measure_mean_absolute_error = true; + continue; + default: + throw mk_runtime_error(fmt::format( + "Initializing MetricsAttrs with unrecogonized metrics type {}", m)); + } + } +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc index 2cf149f78a..940024c9b6 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc @@ -1,5 +1,5 @@ #include 
"pcg/parallel_computation_graph/generate_weight_transform.h" -#include "op-attrs/dim_ordered/enumerate.h" +#include "op-attrs/ff_ordered/enumerate.h" #include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc index feb6e704b2..33e15baed2 100644 --- a/lib/runtime/src/metrics_functions.cc +++ b/lib/runtime/src/metrics_functions.cc @@ -25,39 +25,6 @@ namespace FlexFlow { LegionRuntime::Logger::Category log_metrics("metrics"); -MetricsAttrs::MetricsAttrs(LossFunction _loss_type, - std::vector const &metrics) - : loss_type(_loss_type), measure_accuracy(false), - measure_categorical_crossentropy(false), - measure_sparse_categorical_crossentropy(false), - measure_mean_squared_error(false), measure_root_mean_squared_error(false), - measure_mean_absolute_error(false) { - for (Metric const &m : metrics) { - switch (m) { - case Metric::ACCURACY: - measure_accuracy = true; - continue; - case Metric::CATEGORICAL_CROSSENTROPY: - measure_categorical_crossentropy = true; - continue; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - measure_sparse_categorical_crossentropy = true; - continue; - case Metric::MEAN_SQUARED_ERROR: - measure_mean_squared_error = true; - continue; - case Metric::ROOT_MEAN_SQUARED_ERROR: - measure_root_mean_squared_error = true; - continue; - case Metric::MEAN_ABSOLUTE_ERROR: - measure_mean_absolute_error = true; - continue; - default: - throw mk_runtime_error("Unrecogonized metrics type {}", m); - } - } -} - enum Slots { LOGIT, LABEL, diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h index fbb0b633bf..73dc3bbc51 100644 --- a/lib/runtime/src/metrics_functions.h +++ b/lib/runtime/src/metrics_functions.h @@ -16,38 +16,13 @@ #ifndef _FF_METRICS_FUNCTIONS_H_ #define _FF_METRICS_FUNCTIONS_H_ +#include "kernels/metric.h" #include "kernels/perf_metrics.h" #include "legion.h" -#include "op-attrs/ops/loss_functions.h" #include 
"task_spec/task_invocation.h" -#include "utils/fmt.h" namespace FlexFlow { -enum class Metric { - ACCURACY, - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR, - ROOT_MEAN_SQUARED_ERROR, - MEAN_ABSOLUTE_ERROR, -}; - -class MetricsAttrs { -public: - MetricsAttrs() = delete; - MetricsAttrs(LossFunction, std::vector const &); - -public: - LossFunction loss_type; - bool measure_accuracy; - bool measure_categorical_crossentropy; - bool measure_sparse_categorical_crossentropy; - bool measure_mean_squared_error; - bool measure_root_mean_squared_error; - bool measure_mean_absolute_error; -}; - TypedIndexTaskInvocation compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, 
output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), diff --git a/lib/utils/include/utils/containers/subvec.h b/lib/utils/include/utils/containers/slice.h similarity index 69% rename from lib/utils/include/utils/containers/subvec.h rename to lib/utils/include/utils/containers/slice.h index c89e9227de..a82fb383b5 100644 --- a/lib/utils/include/utils/containers/subvec.h +++ b/lib/utils/include/utils/containers/slice.h @@ -9,9 +9,9 @@ namespace FlexFlow { template -std::vector subvec(std::vector const &v, - std::optional const &maybe_start, - std::optional const &maybe_end) { +std::vector slice(std::vector const &v, + int const &maybe_start, + std::optional const &maybe_end) { auto begin_iter = v.cbegin(); auto end_iter = v.cend(); @@ -22,15 +22,13 @@ std::vector subvec(std::vector const &v, if (idx < 0) { new_idx = size + idx; } - if (new_idx < 0 || new_idx > size) { - throw mk_runtime_error("Index {} is out of bounds for array {}"); - } + + ASSERT(new_idx >= 0, "Index out of bounds"); + ASSERT(new_idx <= size, "Index out of bounds"); return new_idx; }; - if (maybe_start.has_value()) { - begin_iter += resolve_loc(maybe_start.value()); - } + begin_iter += resolve_loc(maybe_start); if (maybe_end.has_value()) { end_iter = v.cbegin() + resolve_loc(maybe_end.value()); diff --git a/lib/utils/include/utils/containers/zip_strict.h b/lib/utils/include/utils/containers/zip_strict.h index 64049042d4..5606fccff1 100644 --- a/lib/utils/include/utils/containers/zip_strict.h +++ b/lib/utils/include/utils/containers/zip_strict.h @@ -4,21 +4,17 @@ #include "utils/containers/zip.h" #include "utils/exception.h" #include "utils/fmt/vector.h" +#include namespace FlexFlow { template std::vector> zip_strict(std::vector const &lhs, std::vector const &rhs) { - if (lhs.size() != rhs.size()) { - throw mk_runtime_error( - fmt::format("zip_strict requires lhs and rhs to have the same length, " - "but 
received lhs={} (length {}), rhs={} (length {})", - lhs, - lhs.size(), - rhs, - rhs.size())); - } + ASSERT(lhs.size() == rhs.size(), + "zip_strict requires lhs and rhs to have the same length", + lhs, + rhs); return zip(lhs, rhs); } diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index 080cbb3611..f95eb8a38d 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -3,6 +3,7 @@ #include "utils/fmt.h" #include +#include #include #include diff --git a/lib/utils/include/utils/indent.h b/lib/utils/include/utils/indent.h new file mode 100644 index 0000000000..eccbd34cfc --- /dev/null +++ b/lib/utils/include/utils/indent.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H + +#include + +namespace FlexFlow { + +std::string indent(std::string const &, int indent_size = 2); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/stack_vector/stack_vector.h b/lib/utils/include/utils/stack_vector/stack_vector.h index 5d4d6eaad3..64d005a10e 100644 --- a/lib/utils/include/utils/stack_vector/stack_vector.h +++ b/lib/utils/include/utils/stack_vector/stack_vector.h @@ -272,18 +272,6 @@ struct stack_vector { return !(*this == other); } - bool operator<(stack_vector const &other) const { - for (std::size_t i = 0; i < std::min(this->m_size, other.m_size); i++) { - if (this->at(i) < other.at(i)) { - return true; - } else if (this->at(i) > other.at(i)) { - return false; - } - } - - return (this->m_size < other.m_size); - } - std::size_t size() const { return this->m_size; } @@ -305,17 +293,16 @@ struct stack_vector { private: std::size_t m_size = 0; std::array contents; - - static_assert( - implies, is_equal_comparable>::value, - ""); - static_assert( - implies, is_neq_comparable>::value, - ""); - static_assert( - implies, is_lt_comparable>::value, ""); }; +template +auto operator<(stack_vector const &lhs, + stack_vector 
const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + template std::ostream &operator<<(std::ostream &s, stack_vector const &v) { return s << fmt::to_string(v); diff --git a/lib/utils/src/utils/containers/slice.cc b/lib/utils/src/utils/containers/slice.cc new file mode 100644 index 0000000000..f960c21881 --- /dev/null +++ b/lib/utils/src/utils/containers/slice.cc @@ -0,0 +1,3 @@ +#include "utils/containers/slice.h" + +namespace FlexFlow {} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/subvec.cc b/lib/utils/src/utils/containers/subvec.cc deleted file mode 100644 index 93c7de31c5..0000000000 --- a/lib/utils/src/utils/containers/subvec.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/subvec.h" diff --git a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc index 8445a2721a..8aed06ae01 100644 --- a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc +++ b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc @@ -1,5 +1,5 @@ #include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" namespace FlexFlow { @@ -27,7 +27,7 @@ BinaryTreePathEntry binary_tree_path_get_top_level(BinaryTreePath const &p) { BinaryTreePath binary_tree_path_get_non_top_level(BinaryTreePath const &p) { return BinaryTreePath{ - subvec(p.entries, 1, std::nullopt), + slice(p.entries, 1, std::nullopt), }; } diff --git a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc index 5b9b592444..459e61be71 100644 --- a/lib/utils/src/utils/graph/series_parallel/series_reduction.cc +++ b/lib/utils/src/utils/graph/series_parallel/series_reduction.cc @@ -3,7 +3,7 @@ #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" #include "utils/containers/require_same.h" 
-#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" #include "utils/graph/digraph/algorithms/get_predecessors.h" @@ -103,7 +103,7 @@ MultiDiEdge Node last = g.get_multidiedge_dst(reduction.edges.back()); std::vector internal_nodes; - for (MultiDiEdge const &e : subvec(reduction.edges, std::nullopt, -1)) { + for (MultiDiEdge const &e : slice(reduction.edges, 0, -1)) { internal_nodes.push_back(g.get_multidiedge_dst(e)); } diff --git a/lib/utils/src/utils/indent.cc b/lib/utils/src/utils/indent.cc new file mode 100644 index 0000000000..2761ad1878 --- /dev/null +++ b/lib/utils/src/utils/indent.cc @@ -0,0 +1,17 @@ +#include "utils/indent.h" +#include "utils/containers/flatmap.h" + +namespace FlexFlow { + +std::string indent(std::string const &s, int indent_size) { + std::string indent_str(indent_size, ' '); + return indent_str + flatmap(s, [&](char c) -> std::string { + if (c == '\n') { + return "\n" + indent_str; + } else { + return std::string{c}; + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/stack_vector/stack_vector.cc b/lib/utils/src/utils/stack_vector/stack_vector.cc index d4fb849412..e2009d74d3 100644 --- a/lib/utils/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/src/utils/stack_vector/stack_vector.cc @@ -1,9 +1,9 @@ #include "utils/stack_vector/stack_vector.h" -#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { -using T = ordered_value_type<0>; +using T = value_type<0>; template struct stack_vector; template struct stack_vector; diff --git a/lib/utils/test/common/include/test/utils/doctest/check_kv.h b/lib/utils/test/common/include/test/utils/doctest/check_kv.h new file mode 100644 index 0000000000..6449b8ac87 --- /dev/null +++ b/lib/utils/test/common/include/test/utils/doctest/check_kv.h @@ -0,0 +1,12 @@ +#ifndef 
_FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H +#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H + +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc index 9522fa7fdb..6df2d925b7 100644 --- a/lib/utils/test/common/src/main.cc +++ b/lib/utils/test/common/src/main.cc @@ -1,2 +1,15 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h" +#define DOCTEST_CONFIG_IMPLEMENT +#include + +#include +#include + +void libassert_throw_exception_handler(libassert::assertion_info const &info) { + throw std::runtime_error("Assertion failed:\n" + info.to_string()); +} + +int main(int argc, char **argv) { + libassert::set_failure_handler(libassert_throw_exception_handler); + + return doctest::Context(argc, argv).run(); +} diff --git a/lib/utils/test/common/src/test/utils/doctest/check_kv.cc b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc new file mode 100644 index 0000000000..d3c1ee335e --- /dev/null +++ b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc @@ -0,0 +1,17 @@ +#include "test/utils/doctest/check_kv.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v) { + std::ostringstream oss; + + oss << std::endl + << indent(k + "=", /*indent_size=*/4) << std::endl + << indent(v, /*indent_size=*/6); + + return oss.str(); +} + +} // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/subvec.cc b/lib/utils/test/src/utils/containers/slice.cc similarity index 69% rename from lib/utils/test/src/utils/containers/subvec.cc rename to lib/utils/test/src/utils/containers/slice.cc index 610fc55b5a..4e4d840bfe 100644 --- a/lib/utils/test/src/utils/containers/subvec.cc +++ b/lib/utils/test/src/utils/containers/slice.cc @@ -1,4 +1,4 @@ -#include 
"utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "test/utils/doctest/fmt/vector.h" #include #include @@ -6,57 +6,57 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("subvec") { + TEST_CASE("slice") { std::vector v = {1, 2, 3, 4, 5}; - SUBCASE("Basic subvector") { - auto result = subvec(v, 1, 4); + SUBCASE("Basic slice") { + auto result = slice(v, 1, 4); std::vector correct = {2, 3, 4}; CHECK(result == correct); } SUBCASE("From beginning to index") { - auto result = subvec(v, std::nullopt, 3); + auto result = slice(v, 0, 3); std::vector correct = {1, 2, 3}; CHECK(result == correct); } SUBCASE("From index to end") { - auto result = subvec(v, 2, std::nullopt); + auto result = slice(v, 2, std::nullopt); std::vector correct = {3, 4, 5}; CHECK(result == correct); } SUBCASE("All of the vector") { - auto result = subvec(v, std::nullopt, std::nullopt); + auto result = slice(v, 0, std::nullopt); std::vector correct = {1, 2, 3, 4, 5}; CHECK(result == correct); } SUBCASE("Start greater than end") { - auto result = subvec(v, 3, 1); + auto result = slice(v, 3, 1); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Start equal to end") { - auto result = subvec(v, 3, 3); + auto result = slice(v, 3, 3); std::vector correct = {}; CHECK(result == correct); } SUBCASE("Negative indices") { - auto result = subvec(v, -3, -1); + auto result = slice(v, -3, -1); std::vector correct = {3, 4}; CHECK(result == correct); } SUBCASE("Upper index is out of bounds by 1") { - CHECK_THROWS(subvec(v, 2, 6)); + CHECK_THROWS(slice(v, 2, 6)); } SUBCASE("Lower index is out of bounds by 1") { - CHECK_THROWS(subvec(v, -6, 2)); + CHECK_THROWS(slice(v, -6, 2)); } } } diff --git a/lib/utils/test/src/utils/indent.cc b/lib/utils/test/src/utils/indent.cc new file mode 100644 index 0000000000..b137253fae --- /dev/null +++ b/lib/utils/test/src/utils/indent.cc @@ -0,0 +1,66 @@ +#include "utils/indent.h" +#include + +using namespace ::FlexFlow; + 
+TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("indent") { + SUBCASE("string is empty") { + std::string input = ""; + + std::string result = indent(input); + std::string correct = " "; + + CHECK(result == correct); + } + + SUBCASE("string is one line") { + std::string input = "hello world"; + std::string result = indent(input); + std::string correct = " hello world"; + + CHECK(result == correct); + } + + SUBCASE("string has multiple lines") { + std::string input = "\n" + "a b\n" + "c d\n" + "e f\n" + "g\n"; + + std::string result = indent(input); + std::string correct = " \n" + " a b\n" + " c d\n" + " e f\n" + " g\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("leading and trailing whitespace is preserved") { + std::string input = " a b \n" + "c d e\n" + " "; + + std::string result = indent(input); + std::string correct = " a b \n" + " c d e\n" + " "; + + CHECK(result == correct); + } + + SUBCASE("allows custom indent size") { + std::string input = "hello\nworld"; + + std::string result = indent(input, /*indent_size=*/4); + std::string correct = " hello\n" + " world"; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/stack_vector/stack_vector.cc b/lib/utils/test/src/utils/stack_vector/stack_vector.cc index c36de733b6..6eb2cc0d88 100644 --- a/lib/utils/test/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/test/src/utils/stack_vector/stack_vector.cc @@ -1,12 +1,97 @@ #include "utils/stack_vector/stack_vector.h" #include "test/utils/doctest/fmt/vector.h" #include "test/utils/rapidcheck.h" +#include "utils/archetypes/value_type.h" #include #include using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("operator<(stack_vector, stack_vector)") { + constexpr std::size_t MAXSIZE = 5; + + SUBCASE("T is ordered") { + SUBCASE("inputs are the same") { + std::vector input = {2, 1, 2, 3}; + + bool result = (input < input); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("lhs is strict prefix of rhs") { + 
std::vector lhs = {2, 1, 2}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs is empty") { + std::vector lhs = {}; + std::vector rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs has a smaller element first") { + std::vector lhs = {2, 1, 0, 3}; + std::vector rhs = {2, 1, 2}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + // from the definition of a strict total order, i.e., + // https://en.wikipedia.org/w/index.php?title=Total_order&oldid=1278541072#Strict_and_non-strict_total_orders + RC_SUBCASE("operator< is irreflexive", + [](stack_vector const &input) { + RC_ASSERT(!(input < input)); + }); + + RC_SUBCASE("operator< is asymmetric", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) == !(rhs < lhs)); + }); + + RC_SUBCASE("operator< is transitive", + [](stack_vector const &a, + stack_vector const &b, + stack_vector const &c) { + RC_PRE(a < b); + RC_PRE(b < c); + + RC_ASSERT(a < c); + }); + + RC_SUBCASE("operator< is connected", + [](stack_vector const &lhs, + stack_vector const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) || (rhs < lhs)); + }); + } + + SUBCASE("T is not ordered") { + bool result = is_lt_comparable_v, MAXSIZE>>; + + CHECK_FALSE(result); + } + } + TEST_CASE_TEMPLATE( "stack_vector::push_back", T, int, double, char) { constexpr std::size_t MAXSIZE = 5; From fd3d7f1df7c78989a49fc00a74ce9367f716aaf6 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 2 May 2025 16:11:20 -0700 Subject: [PATCH 08/11] Add section on EditorConfig to setup guide in CONTRIBUTING.md (#1612) * Add section on EditorConfig to setup guide in CONTRIBUTING.md * Update workflow badges in README --- CONTRIBUTING.md | 14 +++++++++++--- README.md | 3 +-- 2 files changed, 12 insertions(+), 5 deletions(-) diff 
--git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1a1b3c9bee..f52ec68c0c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -83,6 +83,15 @@ Total Test time (real) = 8.64 sec If you don't, or if you see any tests failing, please double check that you have followed the instructions above. If you have and are still encountering an issue, please [contact us](#contact-us) with a detailed description of your platform and the commands you have run. +### EditorConfig + +FlexFlow Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors. +The EditorConfig file for FlexFlow Train can be found in [`.editorconfig`](./.editorconfig). +If you are using vim, emacs, or another editor with built-in EditorConfig support (a full list of editors with built-in EditorConfig support can be found [here](https://editorconfig.org/#pre-installed)) +the configuration will be detected and applied without you needing to do anything. +If you are using an editor not on this list, you will need to install a corresponding [EditorConfig plugin](https://editorconfig.org/#editor-plugins). +**If you are using vscode, you should install [this plugin](https://marketplace.visualstudio.com/items?itemName=EditorConfig.EditorConfig).** + ### GPU setup If you are developing on a machine with one or more CUDA GPUs, you can also run the tests that require a GPU by entering the `gpu` devshell instead of the `default` devshell: @@ -227,9 +236,8 @@ The bulk of the FlexFlow source code is stored in the following folders: We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows: -1. [`tests`](./.github/workflows/per-lib-check.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. 
Also uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). -2. [`clang-format-check.yml`](./.github/workflows/clang-format-check.yml): ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`). -4. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo. +1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`). +2. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo. GPU machines for CI are managed using [runs-on](https://runs-on.com/). 
diff --git a/README.md b/README.md index 0d56bc46e0..f181c4ad96 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # FlexFlow Train -[![clang-format Check](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml/badge.svg?branch=master)](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml) -[![per-lib-checks](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml) +[![tests](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml) [![shell-check](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) From 651ba943fd623285a0df0c0081c9dd26f20041ab Mon Sep 17 00:00:00 2001 From: Victor Li Date: Sat, 17 May 2025 21:43:42 -0700 Subject: [PATCH 09/11] Slight refactoring --- lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 1 - .../compiler/mcmc/{mcmc_algorithm.cc => mcmc_over_mapped_pcg.cc} | 0 2 files changed, 1 deletion(-) rename lib/compiler/test/src/compiler/mcmc/{mcmc_algorithm.cc => mcmc_over_mapped_pcg.cc} (100%) diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 47ecc2479f..75ef4d08a6 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -38,7 +38,6 @@ SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, get_random_pattern_match(random_substitution.value().pcg_pattern, sub_pcg_from_full_pcg(mapped_pcg.pcg)); if (pattern_match != std::nullopt) { - std::cout << 
"HELLO" << std::endl; return apply_substitution_and_update_machine_mapping( mapped_pcg, random_substitution.value(), pattern_match.value()); } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc similarity index 100% rename from lib/compiler/test/src/compiler/mcmc/mcmc_algorithm.cc rename to lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc From e3cf79aa41262d4a1a5a916078a3b7eee76d0bf2 Mon Sep 17 00:00:00 2001 From: Victor Li Date: Sun, 18 May 2025 01:20:05 -0700 Subject: [PATCH 10/11] Updating MCMC to work with substitutions --- .../machine_mapping_mutation_set.h | 13 +- .../src/compiler/graph_optimize_result.cc | 15 -- ...substitution_and_update_machine_mapping.cc | 19 ++- .../machine_mapping_mutation_set.cc | 139 ------------------ .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 1 + lib/compiler/src/compiler/search_result.cc | 2 +- .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 5 +- .../apply_substitution/apply_substitution.cc | 2 - 8 files changed, 21 insertions(+), 175 deletions(-) delete mode 100644 lib/compiler/src/compiler/graph_optimize_result.cc diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h index 6dfefec7d1..796c94b371 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -9,22 +9,11 @@ std::optional get_naive_mapping(ParallelComputationGraph &pcg, MachineSpecification const &resources, DeviceType const &device_type); -std::vector - get_possible_mutations(SearchResult mapped_pcg, - MachineSpecification const &resource); + std::optional get_random_mutation(SearchResult mapped_pcg, MachineSpecification const &resource, DeviceType const &device_type); -MachineView increment_stride(MachineView machine_view, nonnegative_int dim); 
-MachineView decrement_all_strides(MachineView machine_view); -MachineView change_stride(nonnegative_int stride, - MachineView machine_view, - nonnegative_int dim); -MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view); -MachineView change_device_idx(nonnegative_int device_idx, - MachineView machine_view); -MachineView switch_projection(MachineView machine_view, nonnegative_int dim); } // namespace FlexFlow #endif diff --git a/lib/compiler/src/compiler/graph_optimize_result.cc b/lib/compiler/src/compiler/graph_optimize_result.cc deleted file mode 100644 index 33243a226d..0000000000 --- a/lib/compiler/src/compiler/graph_optimize_result.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "compiler/search_result.h" - -namespace FlexFlow { - -std::string format_as(SearchResult const &r) { - return fmt::format("", - as_dot(r.pcg), - r.machine_mapping); -} - -std::ostream &operator<<(std::ostream &s, SearchResult const &r) { - return (s << fmt::to_string(r)); -} - -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc index 411ee67145..1276a63893 100644 --- a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc @@ -2,6 +2,7 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" #include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/apply_substitution.h" #include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" #include "substitutions/open_parallel_tensor_guid_t.h" #include "substitutions/pcg_pattern_match.h" @@ -13,6 +14,7 @@ #include "utils/containers/restrict_keys.h" #include 
"utils/containers/set_minus.h" #include "utils/containers/values.h" +#include "utils/containers/is_subseteq_of.h" namespace FlexFlow { @@ -47,6 +49,7 @@ SearchResult apply_substitution_and_update_machine_mapping( transform(matched_nodes, [&](parallel_layer_guid_t const &node) { return machine_views.at(node); }); + MachineView first_substituted_machine_view = *substituted_machine_views.begin(); std::unordered_map post_node_data = [&] { @@ -56,10 +59,8 @@ SearchResult apply_substitution_and_update_machine_mapping( std::unordered_map post_node_data_from_sub = output_graph_data.node_data; - // just taking the first substituted machine view, not sure if this - // is fine for (auto [layer, attrs] : post_node_data_from_sub) { - machine_views.try_emplace(layer, *substituted_machine_views.begin()); + machine_views.insert_or_assign(layer, first_substituted_machine_view); } return merge_disjoint_maps(post_node_data_from_orig, @@ -175,6 +176,18 @@ SearchResult apply_substitution_and_update_machine_mapping( post_value_data, }; + assert(is_subseteq_of(keys(post_node_data), keys(machine_views))); + + for (auto it = machine_views.begin(); it != machine_views.end(); ) { + if (post_node_data.find(it->first) == post_node_data.end()) { + it = machine_views.erase(it); + } else { + ++it; + } + } + + assert(keys(post_node_data) == keys(machine_views)); + return SearchResult{ pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)), MachineMapping{machine_views}}; diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc index 7f7a54d07a..3688385e2f 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -29,71 +29,6 @@ std::optional std::optional get_random_mutation(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const 
&device_type) { - ParallelComputationGraph pcg = mapped_pcg.pcg; - std::vector layers = topological_ordering(pcg); - if (layers.size() == 0) { - return std::nullopt; - } - parallel_layer_guid_t random_layer = select_random(layers); - MachineMapping machine_mapping = mapped_pcg.machine_mapping; - MachineView machine_view = machine_mapping.machine_views.at(random_layer); - - int mutation_op = select_random(range(6)); - switch (mutation_op) { - case 0: { - machine_view = decrement_all_strides(machine_view); - break; - } - case 1: { - nonnegative_int rand_dim = select_random( - nonnegative_range(nonnegative_int{num_dims(machine_view)})); - machine_view = increment_stride(machine_view, rand_dim); - break; - } - case 2: { - nonnegative_int rand_node_idx = - select_random(nonnegative_range(resources.num_nodes)); - machine_view = change_node_idx(rand_node_idx, machine_view); - break; - } - case 3: { - if (device_type == DeviceType::GPU) { - nonnegative_int rand_device_idx = - select_random(nonnegative_range(resources.num_gpus_per_node)); - machine_view = change_device_idx(rand_device_idx, machine_view); - } else { - nonnegative_int rand_device_idx = - select_random(nonnegative_range(resources.num_cpus_per_node)); - machine_view = change_device_idx(rand_device_idx, machine_view); - } - break; - } - case 4: { - nonnegative_int rand_dim = select_random( - nonnegative_range(nonnegative_int{num_dims(machine_view)})); - machine_view = switch_projection(machine_view, rand_dim); - break; - } - case 5: { - // copy layer - parallel_layer_guid_t layer_to_copy = select_random(layers); - machine_view = machine_mapping.machine_views.at(layer_to_copy); - break; - } - } - OperatorTaskSpace task = get_operator_task_space(pcg, random_layer); - if (is_valid_machine_view(machine_view, task, resources)) { - // only apply it if valid - machine_mapping.machine_views.at(random_layer) = machine_view; - } - return machine_mapping; -} - -// "lazy" version just picks a random available machine view 
for a random layer -std::optional - get_random_mutation_lazy(SearchResult mapped_pcg, MachineSpecification const &resources, DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; @@ -102,7 +37,6 @@ std::optional return std::nullopt; } parallel_layer_guid_t random_layer = select_random(layers); - ; MachineMapping machine_mapping = mapped_pcg.machine_mapping; MachineView machine_view = machine_mapping.machine_views.at(random_layer); @@ -115,77 +49,4 @@ std::optional machine_mapping.machine_views.at(random_layer) = random_new_machine_view; return machine_mapping; } - -MachineView increment_stride(MachineView machine_view, nonnegative_int dim) { - std::vector strides = get_strides(machine_view); - nonnegative_int new_stride = - strides.at(dim.unwrap_nonnegative()).unwrapped + 1_n; - return change_stride(new_stride, machine_view, dim); -} - -MachineView decrement_all_strides(MachineView machine_view) { - std::vector strides = get_strides(machine_view); - for (nonnegative_int dim : - nonnegative_range(nonnegative_int{num_dims(machine_view)})) { - nonnegative_int old_stride = strides.at(dim.unwrap_nonnegative()).unwrapped; - if (old_stride >= 1_n) { - machine_view = - change_stride(nonnegative_int{old_stride.unwrap_nonnegative() - 1}, - machine_view, - dim); - } - } - return machine_view; -} - -MachineView change_stride(nonnegative_int stride, - MachineView machine_view, - nonnegative_int dim) { - std::vector strides = get_strides(machine_view); - strides.at(dim.unwrap_nonnegative()) = stride_t{stride}; - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, strides, get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_node_idx(nonnegative_int node_ix, MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{node_ix, - machine_view.start.device_idx, - 
machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView change_device_idx(nonnegative_int device_idx, - MachineView machine_view) { - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - MachineSpaceCoordinate{machine_view.start.node_idx, - device_idx, - machine_view.start.device_type}, - get_strides(machine_view), - get_dimensions(machine_view)); - return new_machine_view; -} - -MachineView switch_projection(MachineView machine_view, nonnegative_int dim) { - std::vector dims = - get_dimensions(machine_view); - MachineSpecificationDimension projection = dims.at(dim.unwrap_nonnegative()); - if (projection == MachineSpecificationDimension::INTER_NODE) { - dims.at(dim.unwrap_nonnegative()) = - MachineSpecificationDimension::INTRA_NODE; - } else { - dims.at(dim.unwrap_nonnegative()) = - MachineSpecificationDimension::INTER_NODE; - } - MachineView new_machine_view = - machine_view_from_strides_and_machine_spec_dimensions( - machine_view.start, get_strides(machine_view), dims); - return new_machine_view; -} } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 75ef4d08a6..452cf3baa3 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -6,6 +6,7 @@ #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/unity_substitution_set.h" +#include "compiler/search_result.h" #include "utils/optional.h" namespace FlexFlow { diff --git a/lib/compiler/src/compiler/search_result.cc b/lib/compiler/src/compiler/search_result.cc index 33243a226d..0afc10723a 100644 --- a/lib/compiler/src/compiler/search_result.cc +++ b/lib/compiler/src/compiler/search_result.cc @@ -3,7 +3,7 @@ namespace FlexFlow { std::string format_as(SearchResult const &r) { 
- return fmt::format("", + return fmt::format("", as_dot(r.pcg), r.machine_mapping); } diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 5c469c4301..07feef073d 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -65,7 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { MCMCOverMappedPCGConfig search_config = MCMCOverMappedPCGConfig{/*temperature=*/1.0, /*num_iterations=*/100_n, - /*substitution_interval=*/100_n, + /*substitution_interval=*/5_n, /*device_type=*/DeviceType::GPU}; SearchResult result = mcmc_graph_optimize( @@ -74,7 +74,6 @@ TEST_SUITE(FF_TEST_SUITE) { result.pcg, cost_estimator, result.machine_mapping, full_machine_spec); std::cout << runtime << std::endl; - CHECK(runtime < 16); - CHECK(false); + CHECK(runtime < 12); } } diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc index f1354264f8..61bfe15d7b 100644 --- a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc @@ -159,8 +159,6 @@ SubParallelComputationGraph post_value_data, }; - std::cout << as_dot(sub_pcg_from_graph_data(post_data)) << std::endl; - return sub_pcg_from_graph_data(post_data); } From 1af4a6b45c11e8ee836d7c3c0eb626d4f42147ff Mon Sep 17 00:00:00 2001 From: Victor Li Date: Sun, 18 May 2025 01:21:51 -0700 Subject: [PATCH 11/11] Fixing formatting --- .../machine_mapping/machine_mapping_mutation_set.h | 2 +- ...apply_substitution_and_update_machine_mapping.cc | 13 +++++++------ .../machine_mapping/machine_mapping_mutation_set.cc | 4 ++-- .../src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 2 +- .../test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) diff 
--git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h index 796c94b371..43af640e02 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h @@ -9,7 +9,7 @@ std::optional get_naive_mapping(ParallelComputationGraph &pcg, MachineSpecification const &resources, DeviceType const &device_type); - + std::optional get_random_mutation(SearchResult mapped_pcg, MachineSpecification const &resource, diff --git a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc index 1276a63893..252384985b 100644 --- a/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.cc @@ -1,20 +1,20 @@ #include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" -#include "substitutions/apply_substitution/evaluate_substitution_output.h" #include "substitutions/apply_substitution/apply_substitution.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" #include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" #include "substitutions/open_parallel_tensor_guid_t.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/sub_parallel_computation_graph.h" #include "substitutions/sub_parallel_computation_graph_data.dtg.h" #include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/is_subseteq_of.h" #include "utils/containers/keys.h" #include "utils/containers/merge_maps.h" 
#include "utils/containers/restrict_keys.h" #include "utils/containers/set_minus.h" #include "utils/containers/values.h" -#include "utils/containers/is_subseteq_of.h" namespace FlexFlow { @@ -49,7 +49,8 @@ SearchResult apply_substitution_and_update_machine_mapping( transform(matched_nodes, [&](parallel_layer_guid_t const &node) { return machine_views.at(node); }); - MachineView first_substituted_machine_view = *substituted_machine_views.begin(); + MachineView first_substituted_machine_view = + *substituted_machine_views.begin(); std::unordered_map post_node_data = [&] { @@ -178,11 +179,11 @@ SearchResult apply_substitution_and_update_machine_mapping( assert(is_subseteq_of(keys(post_node_data), keys(machine_views))); - for (auto it = machine_views.begin(); it != machine_views.end(); ) { + for (auto it = machine_views.begin(); it != machine_views.end();) { if (post_node_data.find(it->first) == post_node_data.end()) { - it = machine_views.erase(it); + it = machine_views.erase(it); } else { - ++it; + ++it; } } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc index 3688385e2f..15648eab74 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc @@ -29,8 +29,8 @@ std::optional std::optional get_random_mutation(SearchResult mapped_pcg, - MachineSpecification const &resources, - DeviceType const &device_type) { + MachineSpecification const &resources, + DeviceType const &device_type) { ParallelComputationGraph pcg = mapped_pcg.pcg; std::vector layers = topological_ordering(pcg); if (layers.size() == 0) { diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 452cf3baa3..ab7769679e 100644 --- a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ 
b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -2,11 +2,11 @@ #include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" #include "compiler/machine_mapping/machine_mapping_mutation_set.h" #include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "compiler/search_result.h" #include "compiler/task_graph_simulator/task_simulator.h" #include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_match.h" #include "substitutions/unity_substitution_set.h" -#include "compiler/search_result.h" #include "utils/optional.h" namespace FlexFlow { diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc index 07feef073d..7d74d897e4 100644 --- a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -1,5 +1,5 @@ -#include "../cost_estimator_for_test.h" #include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "../cost_estimator_for_test.h" #include "compiler/task_graph_simulator/task_simulator.h" #include "doctest/doctest.h" #include "op-attrs/parallel_tensor_dims.h"