diff --git a/.envrc b/.envrc new file mode 100644 index 0000000000..2797f0f929 --- /dev/null +++ b/.envrc @@ -0,0 +1,3 @@ +source_up_if_exists + +use flake diff --git a/.flake/pkgs/fccf/default.nix b/.flake/pkgs/fccf/default.nix new file mode 100644 index 0000000000..f792b8606c --- /dev/null +++ b/.flake/pkgs/fccf/default.nix @@ -0,0 +1,54 @@ +{ fetchFromGitHub +, stdenv +, cmake +, pkg-config +, libclang +, libllvm +, lib +, zlib +, argparse +, nlohmann_json +, fmt +}: + +stdenv.mkDerivation rec { + pname = "fccf"; + version = "03d373fc65e2d7ceeac441ba4bbddfdc25618dff"; + + src = fetchFromGitHub { + owner = "p-ranav"; + repo = "fccf"; + rev = version; + sha256 = "sha256-3NdPon5ZfjoGFFgBlb0rzRnfWgSopvAc5Gls2NWHaOE="; + }; + + nativeBuildInputs = [ + cmake + pkg-config + ]; + + buildInputs = [ + libclang + libllvm + zlib + argparse + nlohmann_json + fmt + ]; + + patches = [ + ./json-package-name.patch + ./fix-argparse-include.patch + ]; + + cmakeFlags = [ + "-DCMAKE_BUILD_TYPE=Release" + "-DFETCHCONTENT_TRY_FIND_PACKAGE_MODE=ALWAYS" + ]; + + meta = with lib; { + description = "A command-line tool that quickly searches through C/C++ source code in a directory based on a search string and prints relevant code snippets that match the query"; + homepage = "https://github.com/p-ranav/fccf"; + license = licenses.mit; + }; +} diff --git a/.flake/pkgs/fccf/fix-argparse-include.patch b/.flake/pkgs/fccf/fix-argparse-include.patch new file mode 100644 index 0000000000..2cb648c1bf --- /dev/null +++ b/.flake/pkgs/fccf/fix-argparse-include.patch @@ -0,0 +1,13 @@ +diff --git a/source/main.cpp b/source/main.cpp +index 7e131d3..6c05d89 100644 +--- a/source/main.cpp ++++ b/source/main.cpp +@@ -6,7 +6,7 @@ + #include + #include + +-#include ++#include + #include + #include "searcher.hpp" + #include diff --git a/.flake/pkgs/fccf/json-package-name.patch b/.flake/pkgs/fccf/json-package-name.patch new file mode 100644 index 0000000000..51f6a012cf --- /dev/null +++ b/.flake/pkgs/fccf/json-package-name.patch @@ -0,0 +1,12 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 20bcbbf..923075f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -48,6 +48,7 @@ FetchContent_MakeAvailable(fmt) + + FetchContent_Declare(json + URL https://github.com/nlohmann/json/releases/download/v3.10.5/json.tar.xz ++ FIND_PACKAGE_ARGS NAMES nlohmann_json + ) + FetchContent_MakeAvailable(json) + diff --git a/.github/runs-on.yml b/.github/runs-on.yml index a4fff33536..5033e69d65 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,23 +1,4 @@ images: - runs-on-gpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - # to find, go to - # https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Images:visibility=public-images;search=:runs-on;v=3;$case=tags:false%5C,client:false;$regex=tags:false%5C,client:false - name: "runs-on-v2.2-ubuntu22-gpu-x64-20250220122045" - - runs-on-cpu-pinned: - platform: "linux" - arch: "x64" - owner: "135269210855" # runs-on - name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" - - official-ubuntu-ami: - platform: "linux" - arch: "x64" - ami: "ami-0a60b027285c0d4c5" - flexflow-gpu-ci: platform: "linux" arch: "x64" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9d98fb07dd..799e3069a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,9 +57,9 @@ jobs: name: GPU unit tests needs: cpu-ci runs-on: - - runs-on + - runs-on=${{ github.run_id }} - family=g4dn.xlarge - - image=runs-on-gpu-pinned + - 
image=flexflow-gpu-ci
     strategy:
       max-parallel: 1
diff --git a/.proj.toml b/.proj.toml
index a06fb53c3a..8eed6166cd 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -2,57 +2,81 @@ project_name = "flexflow"
 testsuite_macro = "FF_TEST_SUITE"
 namespace_name = "FlexFlow"
 header_extension = ".h"
+cuda_launch_cmd = [
+  "nixGL",
+  "--",
+]
 
 [targets.utils]
 type = "lib"
-tests = true
-benchmarks = true
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = true
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.op-attrs]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.kernels]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = true
+has-cuda-benchmarks = false
 
 [targets.pcg]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.substitutions]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.compiler]
 type = "lib"
-tests = true
-benchmarks = true
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = true
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.substitution-generator]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.local-execution]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.models]
 type = "lib"
-tests = true
-benchmarks = false
+has-cpu-only-tests = true
+has-cpu-only-benchmarks = false
+has-cuda-tests = false
+has-cuda-benchmarks = false
 
 [targets.export-model-arch]
 type = "bin"
+cuda = false
 
 [targets.substitution-to-dot]
 type = "bin"
+cuda = false
 
 # default_build_targets = [
 #   "utils",
diff --git a/.vimrc b/.vimrc
new file mode 100644
index 0000000000..4c8a8a8279
--- /dev/null
+++ b/.vimrc
@@ -0,0 +1,8 @@
+" example search path configuration
+set path=lib/runtime/**,lib/**
+
+" set build target
+" let g:target = "pcg"
+
+" set test target
+" let g:test_target = "utils-test"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1a1b3c9bee..f52ec68c0c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -83,6 +83,15 @@ Total Test time (real) = 8.64 sec
 
 If you don't, or if you see any tests failing, please double check that you have followed the instructions above. If you have and are still encountering an issue, please [contact us](#contact-us) with a detailed description of your platform and the commands you have run.
 
+### EditorConfig
+
+FlexFlow Train uses [EditorConfig](https://editorconfig.org/) to ensure consistent low-level details (indentation settings, character encoding, etc.) across different editors.
+The EditorConfig file for FlexFlow Train can be found in [`.editorconfig`](./.editorconfig).
+If you are using vim, emacs, or another editor with built-in EditorConfig support (the full list of such editors is [here](https://editorconfig.org/#pre-installed)), the configuration will be detected and applied without you needing to do anything.
+If you are using an editor not on this list, you will need to install the corresponding [EditorConfig plugin](https://editorconfig.org/#editor-plugins).
+**If you are using VS Code, you should install [this plugin](https://marketplace.visualstudio.com/items?itemName=EditorConfig.EditorConfig).**
+
 ### GPU setup
 
 If you are developing on a machine with one or more CUDA GPUs, you can also run the tests that require a GPU by entering the `gpu` devshell instead of the `default` devshell:
@@ -227,9 +236,8 @@ The bulk of the FlexFlow source code is stored in the following folders:
 
 We currently implement CI testing using Github Workflows. Each workflow is defined by its corresponding YAML file in the [.github/workflows](.github/workflows) folder of the repo. We currently have the following workflows:
 
-1. [`tests`](./.github/workflows/per-lib-check.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Also uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train).
-2. [`clang-format-check.yml`](./.github/workflows/clang-format-check.yml): ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`).
-4. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo.
+1. [`tests.yml`](./.github/workflows/tests.yml): Builds and runs GPU and non-GPU unit tests for all of the code under `lib` and `bin`. Uploads coverage numbers to [codecov.io](https://app.codecov.io/gh/flexflow/flexflow-train). Also ensures that the source code is properly formatted using `clang-format`. To format your code locally, run `proj format` (see [here](#building-testing-etc) for more information on `proj`).
+2. [`shell-check.yml`](./.github/workflows/shell-check.yml): runs shellcheck on all bash scripts in the repo.
 
 GPU machines for CI are managed using [runs-on](https://runs-on.com/).
diff --git a/README.md b/README.md index 0d56bc46e0..f181c4ad96 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # FlexFlow Train -[![clang-format Check](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml/badge.svg?branch=master)](https://github.com/flexflow/flexflow-train/actions/workflows/clang-format-check.yml) -[![per-lib-checks](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/per-lib-check.yml) +[![tests](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/tests.yml) [![shell-check](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml/badge.svg)](https://github.com/flexflow/flexflow-train/actions/workflows/shell-check.yml) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest) diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 478ebda318..ef5d6d9d11 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -126,11 +126,16 @@ function(ff_add_test_executable) ${FF_TEST_EXEC_NAME} ${SRC}) + target_include_directories( + ${FF_TEST_EXEC_NAME} + PRIVATE + ${FF_TEST_EXEC_PRIVATE_INCLUDE}) + target_link_libraries( ${FF_TEST_EXEC_NAME} ${FF_TEST_EXEC_DEPS}) - target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") + target_compile_definitions(${FF_TEST_EXEC_NAME} PRIVATE FF_TEST_SUITE="cpu-${FF_TEST_EXEC_NAME}" FF_CUDA_TEST_SUITE="cuda-${FF_TEST_EXEC_NAME}") define_ff_vars(${FF_TEST_EXEC_NAME}) ff_set_cxx_properties(${FF_TEST_EXEC_NAME}) diff --git a/flake.lock b/flake.lock index c991232013..ff6e797d51 100644 --- a/flake.lock +++ b/flake.lock @@ -66,11 +66,11 @@ ] }, "locked": { - "lastModified": 1741679698, - "narHash": "sha256-poSOQS/2qImAo/PgRu37pHdOrwAsZEyC8PMM3evFLX4=", + "lastModified": 1746157536, + "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", "owner": "lockshaw", "repo": "proj", - "rev": "0de983ff66abea4703f73988d29fc807e2b0a9bd", + "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 77a6c61b7d..5fa48fa3fd 100644 --- a/flake.nix +++ b/flake.nix @@ -59,6 +59,7 @@ bencher-cli = pkgs.callPackage ./.flake/pkgs/bencher-cli.nix { }; ffdb = pkgs.callPackage ./.flake/pkgs/ffdb { inherit proj; }; hpp2plantuml = pkgs.python3Packages.callPackage ./.flake/pkgs/hpp2plantuml.nix { }; + fccf = pkgs.callPackage ./.flake/pkgs/fccf { }; rapidcheckFull = pkgs.symlinkJoin { name = "rapidcheckFull"; paths = (with pkgs; [ rapidcheck.out rapidcheck.dev ]); @@ -162,6 +163,7 @@ ruff jq gh + expect ]) (with pkgs.python3Packages; [ gitpython @@ -179,6 +181,7 @@ (with self.packages.${system}; [ ffdb hpp2plantuml + fccf ]) ]; }; diff --git a/lib/compiler/include/compiler/algorithm_config.variant.toml b/lib/compiler/include/compiler/algorithm_config.variant.toml new file mode 100644 index 0000000000..4e58104875 --- /dev/null +++ b/lib/compiler/include/compiler/algorithm_config.variant.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "AlgorithmConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "compiler/data_parallelism/data_parallelism_config.dtg.h", + "compiler/unity_algorithm/unity_search_config.dtg.h", +] + +[[values]] +type 
= "::FlexFlow::DataParallelismConfig" + +[[values]] +type = "::FlexFlow::UnitySearchConfig" diff --git a/lib/compiler/include/compiler/compiler.h b/lib/compiler/include/compiler/compiler.h index 178ab19a53..8697c06beb 100644 --- a/lib/compiler/include/compiler/compiler.h +++ b/lib/compiler/include/compiler/compiler.h @@ -1,42 +1,22 @@ #ifndef _FLEXFLOW_COMPILER_COMPILER_H #define _FLEXFLOW_COMPILER_COMPILER_H -#include "pcg/cost_values.h" -#include "pcg/machine_view.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" -#include "pcg/tensor_mapping.h" +#include "compiler/algorithm_config.dtg.h" +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include "pcg/machine_specification.dtg.h" namespace FlexFlow { enum class SearchAlgorithm { DATA_PARALLEL, -}; - -using SearchAlgorithmConfig = std::variant<>; -using SearchSolution = std::variant<>; - -struct SearchResult { - ParallelComputationGraph pcg; - TensorMapping tensor_mapping; - SearchSolution solution; - CostValues cost_values; + UNITY, }; SearchResult optimize(ComputationGraph const &, MachineSpecification const &, CostEstimator const &, - SearchAlgorithm, - optional const &); - -// struct SearchSolution { -// LabelledMultiDiGraph optimized_pcg; -// std::unordered_map device_assignments; -// /* std::unordered_map> tensor_mappings; */ -// }; -// -// SearchSolution run_data_parallelize(ComputationGraph const &, -// MachineSpecification const &); + AlgorithmConfig const &); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml new file mode 100644 index 0000000000..68512fa473 --- /dev/null +++ b/lib/compiler/include/compiler/data_parallelism/data_parallelism_config.struct.toml @@ -0,0 +1,14 @@ +namespace = "FlexFlow" +name = "DataParallelismConfig" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ +] + +[[fields]] +name = "degree" +type = "int" diff --git a/lib/compiler/include/compiler/graph_optimize_result.struct.toml b/lib/compiler/include/compiler/graph_optimize_result.struct.toml deleted file mode 100644 index 22f29cbd59..0000000000 --- a/lib/compiler/include/compiler/graph_optimize_result.struct.toml +++ /dev/null @@ -1,16 +0,0 @@ -namespace = "FlexFlow" -name = "GraphOptimizeResult" -features = [ ] - -includes = [ - "compiler/machine_mapping/machine_mapping.dtg.h", - "pcg/parallel_computation_graph/parallel_computation_graph.h" -] - -[[fields]] -name = "pcg" -type = "::FlexFlow::ParallelComputationGraph" - -[[fields]] -name = "machine_mapping" -type = "::FlexFlow::MachineMapping" diff --git a/lib/compiler/include/compiler/allowed_machine_views.h b/lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h similarity index 100% rename from lib/compiler/include/compiler/allowed_machine_views.h rename to lib/compiler/include/compiler/machine_mapping/allowed_machine_views.h diff --git a/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h new file mode 100644 index 0000000000..b08ca57851 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H +#define 
_FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_AND_UPDATE_MACHINE_MAPPING_H
+
+#include "compiler/search_result.dtg.h"
+#include "substitutions/pcg_pattern_match.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.dtg.h"
+#include "substitutions/substitution.dtg.h"
+
+namespace FlexFlow {
+/**
+ * @brief Applies \p sub to \p mapped_pcg at the location specified by
+ * \p match, returning the resulting SearchResult (mapped pcg)
+ *
+ * @param mapped_pcg
+ * @param sub
+ * @param match The location at which to apply \p sub. This location in
+ * \p mapped_pcg should match \p sub's PCGPattern. Likely created by running
+ * FlexFlow::find_pattern_matches(PCGPattern const &,
+ * SubParallelComputationGraph const &).
+ * @return SearchResult A mapped pcg similar to \p mapped_pcg, but with the
+ * subgraph of the pcg specified by \p match replaced with the result of the
+ * output expression of \p sub, and with the machine mapping updated to cover
+ * the newly created layers
+ */
+SearchResult apply_substitution_and_update_machine_mapping(
+    SearchResult const &mapped_pcg,
+    Substitution const &sub,
+    PCGPatternMatch const &match);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
index 7375cde985..796225637e 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_result.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h"
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_specification.dtg.h"
 #include "pcg/operator_task_space.dtg.h"
@@ -14,6 +16,13 @@ MachineMapping combine_disjoint_mappings(MachineMapping const &,
 
 bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2);
 
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path);
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &, MachineMappingResult const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h
new file mode 100644
index 0000000000..43af640e02
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_mutation_set.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MCMC_MACHINE_MAPPING_MUTATION_SET_H
+
+#include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/search_result.dtg.h"
+
+namespace FlexFlow {
+std::optional<MachineMapping>
+    get_naive_mapping(ParallelComputationGraph &pcg,
+                      MachineSpecification const &resources,
+                      DeviceType const &device_type);
+
+std::optional<MachineMapping>
+    get_random_mutation(SearchResult mapped_pcg,
+                        MachineSpecification const &resources,
+                        DeviceType const &device_type);
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
index 68d02aaa54..168ba6c3d5 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h
@@ -9,6 +9,9 @@
 
 namespace FlexFlow {
 
+bool is_valid_machine_mapping_problem_tree(
+    MachineMappingProblemTree const &problem_tree);
+
 MachineMappingProblemTree
     get_machine_mapping_problem_tree(ParallelComputationGraph const &pcg,
                                      PCGBinarySPDecomposition const &sp);
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
index 29e9e7c90b..3d1dc91d24 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h
@@ -4,6 +4,7 @@
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h"
+#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h"
 #include "utils/full_binary_tree/binary_tree_path.dtg.h"
 #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h"
 #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h"
@@ -27,6 +28,9 @@ std::optional<MachineMappingProblemTree>
     mm_problem_tree_get_subtree_at_path(MachineMappingProblemTree const &,
                                         BinaryTreePath const &);
 
+std::string as_dot(MachineMappingProblemTree const &);
+void debug_print_dot(MachineMappingProblemTree const &);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
index fe76683eb7..7493c68387 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml
@@ -11,6 +11,7 @@ includes = [
   "op-attrs/parallel_tensor_shape.dtg.h",
   "<vector>",
   "pcg/machine_view.dtg.h",
+  "pcg/operator_task_space.dtg.h",
 ]
 
 src_includes = [
@@ -34,3 +35,6 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>"
 name = "output_shapes"
 type = "std::vector<::FlexFlow::ParallelTensorShape>"
 
+[[fields]]
+name = "op_task_space"
+type = "::FlexFlow::OperatorTaskSpace"
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
index b21fea5f24..db2f4e6f0d 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h
@@ -31,6 +31,8 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &);
     make_singleton_machine_mapping_result(float runtime,
                                           MachineView const &machine_view);
 
+[[nodiscard]] float get_runtime_cost(MachineMappingResult const &mm_result);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h
new file mode 100644
index 0000000000..a27ecbc8f4
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_algorithm.h
@@ -0,0 +1,57 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H
+#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_ALGORITHM_H
+
+#include "compiler/mcmc/generic_mcmc_config.dtg.h"
+#include "compiler/mcmc/generic_mcmc_state.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/random_utils.h"
+#include <cmath>
+
+namespace FlexFlow {
+
+template <typename State, typename ScoringFunc>
+void modify_state_for_minimization(
+    Generic_MCMC_state<State, float> &best_state,
+    Generic_MCMC_state<State, float> &current_state,
+    State candidate,
+    ScoringFunc scorer,
+    float temperature) {
+  float best_estimate = best_state.get_score();
+  float new_estimate = scorer(candidate);
+  float delta = new_estimate - best_estimate;
+  if (delta < 0 || (randf() < exp(-delta / temperature))) {
+    current_state = Generic_MCMC_state<State, float>(candidate, new_estimate);
+    if (delta < 0) {
+      best_state = current_state;
+    }
+  }
+}
+
+// GeneratingFunc : State -> nn_int -> std::optional<State>
+// ScoringFunc : State -> float
+
+template <typename State, typename GeneratingFunc, typename ScoringFunc>
+Generic_MCMC_state<State, float>
+    minimize_score(State const &starting_state,
+                   GeneratingFunc const &generator,
+                   ScoringFunc const &scorer,
+                   GenericMCMCConfig const &search_config) {
+  using MCMCState = Generic_MCMC_state<State, float>;
+  MCMCState best_state = MCMCState(starting_state, scorer(starting_state));
+  MCMCState current_state = best_state;
+  for (nonnegative_int i : nonnegative_range(search_config.num_iterations)) {
+    std::optional<State> candidate = generator(current_state.get_state(), i);
+    if (candidate != std::nullopt) {
+      modify_state_for_minimization(best_state,
+                                    current_state,
+                                    candidate.value(),
+                                    scorer,
+                                    search_config.temperature);
+    }
+  }
+  return best_state;
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml
new file mode 100644
index 0000000000..e11c84f0bd
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_config.struct.toml
@@ -0,0 +1,19 @@
+namespace = "FlexFlow"
+name = "GenericMCMCConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h"
+]
+
+[[fields]]
+name = "temperature"
+type = "float"
+
+[[fields]]
+name = "num_iterations"
+type = "::FlexFlow::nonnegative_int"
\ No newline at end of file
diff --git a/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h
new file mode 100644
index 0000000000..6a6aada32b
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/generic_mcmc_state.h
@@ -0,0 +1,27 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H
+#define _FLEXFLOW_COMPILER_MCMC_GENERIC_MCMC_STATE_H
+#include "utils/nonnegative_int/nonnegative_int.h"
+
+namespace FlexFlow {
+
+template <typename State, typename Score>
+struct Generic_MCMC_state {
+public:
+  Generic_MCMC_state(State const &state, Score const &score)
+      : state(state), score(score) {}
+
+  State const &get_state() const {
+    return state;
+  }
+  Score const &get_score() const {
+    return score;
+  }
+
+private:
+  State state;
+  Score score;
+};
+
+} // namespace FlexFlow
+
+#endif
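[Editor's note] The `GeneratingFunc`/`ScoringFunc` contract above is easiest to see with a toy instantiation. The following sketch is not part of the patch: the random-walk generator and quadratic scorer are invented for illustration, and it assumes the dtgen-generated `GenericMCMCConfig` constructor takes its fields in declaration order (`randf()` comes from `utils/random_utils.h`, which the header already includes):

```cpp
#include "compiler/mcmc/generic_mcmc_algorithm.h"

using namespace FlexFlow;

int main() {
  // Generator: propose a +/-1 random step; never returns nullopt here.
  auto generator = [](int x, nonnegative_int /*iter*/) -> std::optional<int> {
    return x + (randf() < 0.5 ? -1 : 1);
  };
  // Scorer: (x - 7)^2, so the search should settle near x == 7.
  auto scorer = [](int x) -> float {
    return static_cast<float>((x - 7) * (x - 7));
  };

  GenericMCMCConfig config = GenericMCMCConfig{
      /*temperature=*/1.0f,
      /*num_iterations=*/nonnegative_int{1000},
  };

  Generic_MCMC_state<int, float> best =
      minimize_score(/*starting_state=*/0, generator, scorer, config);
  return static_cast<int>(best.get_score());
}
```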
diff --git a/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h
new file mode 100644
index 0000000000..c2d8737184
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg.h
@@ -0,0 +1,22 @@
+#ifndef _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H
+#define _FLEXFLOW_COMPILER_MCMC_OVER_MAPPED_PCG_H
+
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/mcmc/mcmc_over_mapped_pcg_config.dtg.h"
+#include "compiler/search_result.dtg.h"
+#include "pcg/computation_graph.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "substitutions/sub_parallel_computation_graph.h"
+#include "substitutions/substitution.h"
+
+namespace FlexFlow {
+
+SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg,
+                                 CostEstimator const &cost_estimator,
+                                 MachineSpecification const &resources,
+                                 MCMCOverMappedPCGConfig const &search_config);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml
new file mode 100644
index 0000000000..e1548a581e
--- /dev/null
+++ b/lib/compiler/include/compiler/mcmc/mcmc_over_mapped_pcg_config.struct.toml
@@ -0,0 +1,28 @@
+namespace = "FlexFlow"
+name = "MCMCOverMappedPCGConfig"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
+  "pcg/device_type.dtg.h",
+  "utils/nonnegative_int/nonnegative_int.h"
+]
+
+[[fields]]
+name = "temperature"
+type = "float"
+
+[[fields]]
+name = "num_iterations"
+type = "::FlexFlow::nonnegative_int"
+
+[[fields]]
+name = "substitution_interval"
+type = "::FlexFlow::nonnegative_int"
+
+[[fields]]
+name = "device_type"
+type = "::FlexFlow::DeviceType"
\ No newline at end of file
diff --git a/lib/compiler/include/compiler/graph_optimize_result.h b/lib/compiler/include/compiler/search_result.h
similarity index 54%
rename from lib/compiler/include/compiler/graph_optimize_result.h
rename to lib/compiler/include/compiler/search_result.h
index f3843e2a93..197b36e9ea 100644
--- a/lib/compiler/include/compiler/graph_optimize_result.h
+++ b/lib/compiler/include/compiler/search_result.h
@@ -1,12 +1,12 @@
 #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_GRAPH_OPTIMIZE_RESULT_H
 #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_GRAPH_OPTIMIZE_RESULT_H
 
-#include "compiler/graph_optimize_result.dtg.h"
+#include "compiler/search_result.dtg.h"
 
 namespace FlexFlow {
 
-std::string format_as(GraphOptimizeResult const &);
-std::ostream &operator<<(std::ostream &, GraphOptimizeResult const &);
+std::string format_as(SearchResult const &);
+std::ostream &operator<<(std::ostream &, SearchResult const &);
 
 } // namespace FlexFlow
 
diff --git a/lib/compiler/include/compiler/search_result.struct.toml b/lib/compiler/include/compiler/search_result.struct.toml
new file mode 100644
index 0000000000..120d182c75
--- /dev/null
+++ b/lib/compiler/include/compiler/search_result.struct.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "SearchResult"
+features = [
+]
+
+includes = [
+  "pcg/parallel_computation_graph/parallel_computation_graph.h",
+  "compiler/machine_mapping/machine_mapping.h",
+]
+
+[[fields]]
+name = "pcg"
+type = "::FlexFlow::ParallelComputationGraph"
+
+[[fields]]
+name = "machine_mapping"
+type = "::FlexFlow::MachineMapping"
diff --git a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h 
b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h index d43edaa79d..bb7459c767 100644 --- a/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_SERIES_PARALLEL_GET_PCG_BALANCED_BINARY_SP_DECOMPOSITION_H +#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" + namespace FlexFlow { std::optional diff --git a/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h index 86fa1a59aa..e4fd841787 100644 --- a/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h @@ -27,6 +27,10 @@ std::optional std::unordered_multiset get_parallel_layers(PCGBinarySPDecomposition const &); +PCGBinarySPDecomposition + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + BinarySPDecompositionTree const &); + SPDecompositionTreeNodeType get_node_type(PCGBinarySPDecomposition const &); std::unordered_set diff --git a/lib/compiler/include/compiler/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm.h deleted file mode 100644 index 232f2b9563..0000000000 --- a/lib/compiler/include/compiler/unity_algorithm.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H -#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H - -#include "compiler/cost_estimator/cost_estimator.h" -#include "compiler/graph_optimize_result.dtg.h" -#include "optimizer_config.dtg.h" -#include "pcg/computation_graph.h" -#include "pcg/machine_specification.dtg.h" -#include "substitutions/sub_parallel_computation_graph.h" - -namespace FlexFlow { - -GraphOptimizeResult graph_optimize( - ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - std::function( - ParallelLayerAttrs const &, MachineSpecification const &)> const - &allowed_machine_views, - OptimizerConfig const &opt_config); - -} // namespace FlexFlow - -#endif diff --git a/lib/compiler/include/compiler/graph_optimize_state.h b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h similarity index 63% rename from lib/compiler/include/compiler/graph_optimize_state.h rename to lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h index 404111ff8b..9f609f3118 100644 --- a/lib/compiler/include/compiler/graph_optimize_state.h +++ b/lib/compiler/include/compiler/unity_algorithm/graph_optimize_state.h @@ -1,16 +1,17 @@ -#ifndef _FLEXFLOW_COMPILER_MCMC_STATE_H -#define _FLEXFLOW_COMPILER_MCMC_STATE_H +#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H +#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_STATE_H -#include "compiler/graph_optimize_result.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" namespace FlexFlow { struct GraphOptimizeState { - explicit GraphOptimizeState(GraphOptimizeResult const &graph_optimize_result, + GraphOptimizeState() = delete; + explicit GraphOptimizeState(ParallelComputationGraph const &pcg, float runtime); - GraphOptimizeResult graph_optimize_result; - float runtime; + ParallelComputationGraph pcg; + float runtime_with_optimal_mm; bool 
operator==(GraphOptimizeState const &other) const; bool operator!=(GraphOptimizeState const &other) const; diff --git a/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h new file mode 100644 index 0000000000..618e764f80 --- /dev/null +++ b/lib/compiler/include/compiler/unity_algorithm/unity_algorithm.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H +#define _FLEXFLOW_COMPILER_UNITY_ALGORITHM_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/search_result.dtg.h" +#include "compiler/unity_algorithm/unity_search_config.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "substitutions/substitution.h" + +namespace FlexFlow { + +SearchResult graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/optimizer_config.struct.toml b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml similarity index 90% rename from lib/compiler/include/compiler/optimizer_config.struct.toml rename to lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml index b7f4f71e9c..9ec22cf916 100644 --- a/lib/compiler/include/compiler/optimizer_config.struct.toml +++ b/lib/compiler/include/compiler/unity_algorithm/unity_search_config.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "OptimizerConfig" +name = "UnitySearchConfig" features = [ "eq", "hash", diff --git a/lib/compiler/src/compiler/compiler.cc b/lib/compiler/src/compiler/compiler.cc new file mode 100644 index 0000000000..a58651f01a --- /dev/null +++ b/lib/compiler/src/compiler/compiler.cc @@ -0,0 +1,26 @@ +#include "compiler/compiler.h" +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/overload.h" + +namespace FlexFlow { + +SearchResult optimize(ComputationGraph const &computation_graph, + MachineSpecification const &machine_specification, + CostEstimator const &cost_estimator, + AlgorithmConfig const &search_config) { + return search_config.visit(overload{ + [&](DataParallelismConfig const &config) -> SearchResult { + throw std::runtime_error( + "Data parallel search algorithm is not implemented yet"); + }, + [&](UnitySearchConfig const &config) { + ParallelComputationGraph pcg = + pcg_from_computation_graph(computation_graph); + return graph_optimize( + pcg, cost_estimator, machine_specification, config); + }, + }); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/graph_optimize_result.cc b/lib/compiler/src/compiler/graph_optimize_result.cc deleted file mode 100644 index f48c119603..0000000000 --- a/lib/compiler/src/compiler/graph_optimize_result.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "compiler/graph_optimize_result.h" - -namespace FlexFlow { - -std::string format_as(GraphOptimizeResult const &r) { - return fmt::format("", - as_dot(r.pcg), - r.machine_mapping); -} - -std::ostream &operator<<(std::ostream &s, GraphOptimizeResult const &r) { - return (s << fmt::to_string(r)); -} - -} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/graph_optimize_state.cc b/lib/compiler/src/compiler/graph_optimize_state.cc deleted file mode 100644 index 1091b92866..0000000000 --- a/lib/compiler/src/compiler/graph_optimize_state.cc +++ /dev/null @@ -1,96 +0,0 @@ -#include "compiler/graph_optimize_state.h" 
-#include "compiler/graph_optimize_result.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" - -namespace FlexFlow { - -GraphOptimizeState::GraphOptimizeState( - GraphOptimizeResult const &graph_optimize_result, float runtime) - : graph_optimize_result(graph_optimize_result), runtime(runtime) {} - -bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const { - // Note(@wmdi): This is a hack to implement a partially correct homomorphism - // check. Switch to the homomorphism check used in substitutions right after - // https://github.com/flexflow/FlexFlow/pull/1471 is merged. - auto layers1 = topological_ordering(graph_optimize_result.pcg); - auto layers2 = topological_ordering(other.graph_optimize_result.pcg); - if (layers1.size() != layers2.size()) { - return false; - } - std::unordered_map mapping; - for (size_t i = 0; i < layers1.size(); ++i) { - if (get_parallel_layer_attrs(graph_optimize_result.pcg, layers1[i]) != - get_parallel_layer_attrs(other.graph_optimize_result.pcg, layers2[i])) { - return false; - } - auto inputs1 = get_incoming_tensors(graph_optimize_result.pcg, layers1[i]); - auto inputs2 = - get_incoming_tensors(other.graph_optimize_result.pcg, layers2[i]); - if (inputs1.size() != inputs2.size()) { - return false; - } - for (size_t j = 0; j < inputs1.size(); ++j) { - if (inputs1[j] != mapping.at(inputs2[j])) { - return false; - } - } - auto outputs1 = get_layer_outputs(graph_optimize_result.pcg, layers1[i]); - auto outputs2 = - get_layer_outputs(other.graph_optimize_result.pcg, layers2[i]); - if (outputs1.size() != outputs2.size()) { - return false; - } - for (size_t j = 0; j < outputs1.size(); ++j) { - mapping.emplace(outputs2[j], outputs1[j]); - } - } - return true; -} - -bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const { - return !(*this == other); -} - -bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const { - return runtime < other.runtime; -} - -std::string format_as(GraphOptimizeState const &st) { - return fmt::format("", - st.graph_optimize_result, - st.runtime); -} - -std::ostream &operator<<(std::ostream &s, GraphOptimizeState const &st) { - return (s << fmt::to_string(st)); -} - -} // namespace FlexFlow - -namespace std { - -size_t hash<::FlexFlow::GraphOptimizeState>::operator()( - ::FlexFlow::GraphOptimizeState const &state) const { - // TODO(@wmdi): Eventually it might be good to use a proper graph hash like - // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash - size_t seed = 0; - auto layers = topological_ordering(state.graph_optimize_result.pcg); - ::FlexFlow::hash_combine(seed, layers.size()); - for (auto layer : layers) { - ::FlexFlow::hash_combine( - seed, get_parallel_layer_attrs(state.graph_optimize_result.pcg, layer)); - auto inputs = get_incoming_tensors(state.graph_optimize_result.pcg, layer); - ::FlexFlow::hash_combine(seed, inputs.size()); - for (auto input : inputs) { - for (size_t i = 0; i < layers.size(); ++i) { - if (get_source_layer(input) == layers[i]) { - ::FlexFlow::hash_combine(seed, i); - break; - } - } - } - } - return seed; -} - -} // namespace std diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc similarity index 79% rename from lib/compiler/src/compiler/allowed_machine_views.cc rename to 
lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
index 6f86d1d82a..b4df1451ca 100644
--- a/lib/compiler/src/compiler/allowed_machine_views.cc
+++ b/lib/compiler/src/compiler/machine_mapping/allowed_machine_views.cc
@@ -1,4 +1,4 @@
-#include "compiler/allowed_machine_views.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
 #include "pcg/machine_specification.h"
 #include "pcg/machine_view.h"
 #include "pcg/multi_dimensional_stride.dtg.h"
@@ -57,6 +57,8 @@ static std::unordered_set<MachineView>
         product(transform(tensor_dims, [](nonnegative_int num_devices) {
           return nonnegative_int{num_devices.unwrap_nonnegative() - 1};
         }));
+    min_num_devices_with_full_stride_volume =
+        std::max(min_num_devices_with_full_stride_volume, 1_n);
     return ceildiv(total_devices, min_num_devices_with_full_stride_volume);
   };
 
@@ -66,13 +68,19 @@ static std::unordered_set<MachineView>
     nonnegative_int max_stride_upper_bound =
         get_max_stride_upper_bound(tensor_dims, total_devices);
 
-    std::vector<stride_t> single_stride_range =
-        transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n),
-                  [](nonnegative_int stride) { return stride_t{stride}; });
+    std::vector<std::vector<stride_t>> stride_options =
+        transform(tensor_dims, [&](nonnegative_int dim_size) {
+          if (dim_size != 1_n) {
+            return transform(
+                nonnegative_range(1_n, max_stride_upper_bound + 1_n),
+                [](nonnegative_int stride) { return stride_t{stride}; });
+          } else {
+            return std::vector{stride_t{1_n}};
+          }
+        });
+
     std::unordered_multiset<std::vector<stride_t>> raw_stride_vectors =
-        cartesian_product(
-            repeat_element(/*num_times=*/num_elements(tensor_dims),
-                           /*element=*/single_stride_range));
+        cartesian_product(stride_options);
     std::unordered_multiset<MultiDimensionalStride> strides =
         transform(raw_stride_vectors, [](auto const &stride_vec) {
           return MultiDimensionalStride{stride_vec};
@@ -94,10 +102,18 @@ static std::unordered_set<MachineView>
   };
 
   auto candidate_dimensions = [](OperatorTaskSpace const &task) {
-    std::unordered_set<MachineSpecificationDimension> options = {
-        MachineSpecificationDimension::INTER_NODE,
-        MachineSpecificationDimension::INTRA_NODE};
-    return get_all_permutations_with_repetition(options, num_dims(task));
+    std::vector<std::vector<MachineSpecificationDimension>> dimension_options =
+        transform(task.degrees, [](nonnegative_int dim_size) {
+          if (dim_size == 1_n) {
+            return std::vector{
+                MachineSpecificationDimension::INTRA_NODE};
+          } else {
+            return std::vector{
+                MachineSpecificationDimension::INTER_NODE,
+                MachineSpecificationDimension::INTRA_NODE};
+          }
+        });
+    return cartesian_product(dimension_options);
   };
 
   std::vector<nonnegative_int> tensor_dims = task.degrees;
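[Editor's note] The effect of the two hunks above is that task-space dimensions of degree 1 contribute exactly one stride option and one machine-specification-dimension option, so the cartesian products of candidates shrink accordingly. A standalone counting sketch of this idea, using plain `std::` types rather than the FlexFlow helpers (purely illustrative):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Stand-in for the per-dimension option lists built above: each inner
// vector holds the candidate values for one task-space dimension.
std::size_t num_combinations(std::vector<std::vector<int>> const &options) {
  std::size_t n = 1;
  for (auto const &opts : options) {
    n *= opts.size();
  }
  return n;
}

int main() {
  std::vector<int> full = {1, 2, 3}; // e.g. three candidate strides
  std::vector<int> pinned = {1};     // degree-1 dims get a single option

  // degrees = {4, 4}: both dimensions enumerate all strides.
  std::cout << num_combinations({full, full}) << "\n"; // prints 9
  // degrees = {4, 1}: the second dimension is pinned.
  std::cout << num_combinations({full, pinned}) << "\n"; // prints 3
}
```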
+#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/is_subseteq_of.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +SearchResult apply_substitution_and_update_machine_mapping( + SearchResult const &mapped_pcg, + Substitution const &sub, + PCGPatternMatch const &match) { + SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg); + + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map machine_views = + mapped_pcg.machine_mapping.machine_views; + + std::unordered_set substituted_machine_views = + transform(matched_nodes, [&](parallel_layer_guid_t const &node) { + return machine_views.at(node); + }); + MachineView first_substituted_machine_view = + *substituted_machine_views.begin(); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + for (auto [layer, attrs] : post_node_data_from_sub) { + machine_views.insert_or_assign(layer, first_substituted_machine_view); + } + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + 
+        SubParallelComputationGraphEdge new_edge =
+            subpcg_edge_from_tensor_and_use(base_graph_tensor, use);
+        incoming_to_sub_edges.insert(new_edge);
+      }
+    }
+
+    std::unordered_set<SubParallelComputationGraphEdge> outgoing_from_sub_edges;
+    for (ParallelComputationGraphEdge const &outgoing_edge :
+         get_subgraph_outgoing_edges(spcg, matched_nodes)) {
+      parallel_tensor_guid_t original_tensor =
+          get_parallel_tensor(outgoing_edge);
+      PatternNodeOutput pattern_tensor =
+          output_orig_pattern_mapping.at_r(original_tensor);
+      OutputGraphExprNodeOutput output_graph_tensor =
+          sub.outputs_mapping.at_l(pattern_tensor);
+      parallel_tensor_guid_t new_tensor =
+          output_post_outexpr_mapping.at_r(output_graph_tensor);
+
+      SubParallelComputationGraphEdge new_edge =
+          subpcg_edge_from_tensor_and_dst(
+              new_tensor,
+              get_dst_layer(outgoing_edge),
+              get_dst_layer_input_idx(outgoing_edge));
+      outgoing_from_sub_edges.insert(new_edge);
+    }
+
+    return set_union(std::vector{
+        post_edges_from_orig,
+        post_edges_from_sub,
+        incoming_to_sub_edges,
+        outgoing_from_sub_edges,
+    });
+  }();
+
+  std::unordered_set<input_parallel_tensor_guid_t> post_inputs =
+      pre_data.inputs;
+
+  std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+      post_value_data = [&] {
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_orig = filter_keys(
+                pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) {
+                  return visit_open_parallel_tensor_guid(
+                      t,
+                      overload{
+                          [&](parallel_tensor_guid_t const &t) {
+                            return contains(post_nodes_from_original_graph,
+                                            get_source_layer(t));
+                          },
+                          [](input_parallel_tensor_guid_t const &) {
+                            return true;
+                          },
+                      });
+                });
+
+        std::unordered_map<open_parallel_tensor_guid_t, ParallelTensorAttrs>
+            post_value_data_from_sub = output_graph_data.value_data;
+        return merge_disjoint_maps(post_value_data_from_orig,
+                                   post_value_data_from_sub);
+      }();
+
+  SubParallelComputationGraphData post_data = SubParallelComputationGraphData{
+      post_node_data,
+      post_edges,
+      post_inputs,
+      post_value_data,
+  };
+
+  assert(is_subseteq_of(keys(post_node_data), keys(machine_views)));
+
+  for (auto it = machine_views.begin(); it != machine_views.end();) {
+    if (post_node_data.find(it->first) == post_node_data.end()) {
+      it = machine_views.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  assert(keys(post_node_data) == keys(machine_views));
+
+  return SearchResult{
+      pcg_from_sub_pcg_by_dropping_inputs(sub_pcg_from_graph_data(post_data)),
+      MachineMapping{machine_views}};
+}
+
+} // namespace FlexFlow
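[Editor's note] A minimal sketch of how this function is meant to be driven (illustrative only; it assumes, per the doc comment in the header, that `find_pattern_matches(PCGPattern const &, SubParallelComputationGraph const &)` produces the candidate `PCGPatternMatch`es, and that it returns them in a `std::vector`):

```cpp
#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h"
#include "substitutions/pcg_pattern.h"
#include "substitutions/sub_parallel_computation_graph.h"
#include <vector>

namespace FlexFlow {

// Apply `sub` at the first location where its pattern matches, or return
// the input unchanged if there is no match.
SearchResult apply_at_first_match(SearchResult const &mapped_pcg,
                                  Substitution const &sub) {
  SubParallelComputationGraph spcg = sub_pcg_from_full_pcg(mapped_pcg.pcg);
  std::vector<PCGPatternMatch> matches =
      find_pattern_matches(sub.pcg_pattern, spcg);
  if (matches.empty()) {
    return mapped_pcg;
  }
  return apply_substitution_and_update_machine_mapping(
      mapped_pcg, sub, matches.front());
}

} // namespace FlexFlow
```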
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 49d528e4ab..0743301e8f 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -16,9 +16,13 @@
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/contains.h"
+#include "utils/containers/contains_key.h"
 #include "utils/containers/flatmap.h"
 #include "utils/containers/generate_map.h"
 #include "utils/containers/get_all_assignments.h"
+#include "utils/containers/keys.h"
+#include "utils/containers/merge_maps.h"
+#include "utils/containers/set_minus.h"
 #include "utils/containers/unordered_set_of.h"
 #include "utils/exception.h"
 #include "utils/overload.h"
@@ -80,17 +84,23 @@ MachineMappingResult
                       &parallel_split_transformation) {
 
   auto get_boundary_machine_view_assignments =
-      [&](std::unordered_set<BinaryTreePath> const &boundary_layers)
+      [&](std::unordered_set<BinaryTreePath> const &boundary_layers,
+          MachineMappingProblemTree const &t,
+          BinaryTreePathEntry const &prefix)
       -> std::unordered_set<ParallelLayerGuidObliviousMachineMapping> {
+    std::unordered_set<BinaryTreePath> unconstrained_boundary_layers =
+        set_minus(boundary_layers,
+                  keys(restrict_to_child(constraints, prefix).machine_views));
+
     std::unordered_map<BinaryTreePath, std::unordered_set<MachineView>>
         allowed = generate_map(
-            boundary_layers,
+            unconstrained_boundary_layers,
             [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> {
+              MachineMappingProblemTree subtree_at_path =
+                  expect(mm_problem_tree_get_subtree_at_path(t, l),
+                         "Failed to get subtree at path");
               UnmappedOpCostEstimateKey leaf =
-                  mm_problem_tree_get_subtree_at_path(
-                      MachineMappingProblemTree{series_split}, l)
-                      .value()
-                      .get<UnmappedOpCostEstimateKey>();
+                  subtree_at_path.get<UnmappedOpCostEstimateKey>();
               return context.allowed_machine_views(leaf, resources);
             });
     return transform(
@@ -138,24 +148,37 @@ MachineMappingResult
 
   for (ParallelLayerGuidObliviousMachineMapping const
            &assigned_pre_machine_views :
-       get_boundary_machine_view_assignments(get_src_layers(tensor_movement))) {
+       get_boundary_machine_view_assignments(get_src_layers(tensor_movement),
+                                             series_split.get_left_child(),
+                                             BinaryTreePathEntry::LEFT_CHILD)) {
 
     MachineMappingResult pre_result =
         eval_pre_boundary_mapping(assigned_pre_machine_views);
 
+    if (is_infeasible(pre_result)) {
+      continue;
+    }
+
     for (ParallelLayerGuidObliviousMachineMapping const
              &assigned_post_machine_views :
          get_boundary_machine_view_assignments(
-             get_dst_layers(tensor_movement))) {
+             get_dst_layers(tensor_movement),
+             series_split.get_right_child(),
+             BinaryTreePathEntry::RIGHT_CHILD)) {
       MachineMappingResult post_result =
           eval_post_boundary_mapping(assigned_post_machine_views);
+
+      if (is_infeasible(post_result)) {
+        continue;
+      }
+
       TensorSetMovement comm_across_split =
           concretize_abstracted_tensor_set_movement(
               tensor_movement,
-              /*pre_mapping=*/assigned_pre_machine_views,
-              /*post_mapping=*/assigned_post_machine_views);
+              /*pre_mapping=*/pre_result.raw_result.value().machine_mapping,
+              /*post_mapping=*/post_result.raw_result.value().machine_mapping);
+
       float cost_across_split =
           context.cost_estimator.estimate_cost(comm_across_split);
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 82c8274808..07bde820e9 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,7 +1,16 @@
 #include "compiler/machine_mapping/machine_mapping.h"
+#include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/are_disjoint.h"
 #include "utils/containers/keys.h"
+#include "utils/containers/map_keys.h"
 #include "utils/containers/merge_maps.h"
+#include "utils/containers/transform.h"
+#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h"
 
 namespace FlexFlow {
 
@@ -15,4 +24,39 @@ bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {
   return are_disjoint(keys(m1.machine_views), keys(m2.machine_views));
 }
 
+parallel_layer_guid_t
+    get_layer_from_path(PCGBinarySPDecomposition const &sp_decomposition,
+                        BinaryTreePath const &path) {
+  std::optional<PCGBinarySPDecomposition> subtree_optional =
+      get_subtree_at_path(
+          sp_decomposition, generic_impl_for_pcg_sp_tree(), path);
+
+  if (!subtree_optional.has_value()) {
+    throw std::runtime_error(fmt::format("Invalid tree path {}", path));
+  }
+
+  PCGBinarySPDecomposition subtree = subtree_optional.value();
+  if (!subtree.is_leaf()) {
+    throw std::runtime_error(
+        fmt::format("Invalid tree path to a leaf: found {} instead", subtree));
+  }
+  return subtree.require_leaf();
+}
+
+std::optional<MachineMapping> get_machine_mapping_from_machine_mapping_result(
+    PCGBinarySPDecomposition const &sp_decomposition,
+    MachineMappingResult const &mm_result) {
+
+  return transform(
+      mm_result.raw_result,
+      [&](FeasibleMachineMappingResult const &feasible_mm_result) {
+        return MachineMapping{
+            map_keys(feasible_mm_result.machine_mapping.raw_mapping,
+                     [&](BinaryTreePath const &path) {
+                       return get_layer_from_path(sp_decomposition, path);
+                     }),
+        };
+      });
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc
new file mode 100644
index 0000000000..15648eab74
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_mutation_set.cc
@@ -0,0 +1,52 @@
+#include "compiler/machine_mapping/machine_mapping_mutation_set.h"
+#include "compiler/machine_mapping/allowed_machine_views.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.h"
+#include "utils/containers/vector_of.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/random_utils.h"
+#include "utils/vector.h"
+
+namespace FlexFlow {
+
+std::optional<MachineMapping>
+    get_naive_mapping(ParallelComputationGraph &pcg,
+                      MachineSpecification const &resources,
+                      DeviceType const &device_type) {
+  std::vector<parallel_layer_guid_t> layers = topological_ordering(pcg);
+  std::unordered_map<parallel_layer_guid_t, MachineView> machine_views;
+  for (parallel_layer_guid_t layer : layers) {
+    OperatorTaskSpace task = get_operator_task_space(pcg, layer);
+    std::unordered_set<MachineView> allowed_machine_views =
+        get_allowed_machine_views(resources, task, device_type);
+    if (allowed_machine_views.empty()) {
+      return std::nullopt;
+    }
+    machine_views.insert({layer, *(allowed_machine_views.begin())});
+  }
+  return MachineMapping{machine_views};
+}
+
+std::optional<MachineMapping>
+    get_random_mutation(SearchResult mapped_pcg,
+                        MachineSpecification const &resources,
+                        DeviceType const &device_type) {
+  ParallelComputationGraph pcg = mapped_pcg.pcg;
+  std::vector<parallel_layer_guid_t> layers = topological_ordering(pcg);
+  if (layers.size() == 0) {
+    return std::nullopt;
+  }
+  parallel_layer_guid_t random_layer = select_random(layers);
+
+  MachineMapping machine_mapping = mapped_pcg.machine_mapping;
+  MachineView machine_view = machine_mapping.machine_views.at(random_layer);
+  OperatorTaskSpace task = get_operator_task_space(pcg, random_layer);
+
+  std::vector<MachineView> allowed_machine_views =
+      vector_of(get_allowed_machine_views(resources, task, device_type));
+  MachineView random_new_machine_view = select_random(allowed_machine_views);
+
+  machine_mapping.machine_views.at(random_layer) = random_new_machine_view;
+  return machine_mapping;
+}
+} // namespace FlexFlow
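[Editor's note] Together, `get_naive_mapping` and `get_random_mutation` give a seed state and a transition step for the MCMC loop. A hedged sketch of how they compose (illustrative; the real wiring presumably lives in `mcmc_over_mapped_pcg.cc`, which is not part of this diff):

```cpp
#include "compiler/machine_mapping/machine_mapping_mutation_set.h"

namespace FlexFlow {

// Seed with the naive mapping, then take a single random mutation step.
// Error handling is minimal; a real caller would loop and feed candidates
// into minimize_score from generic_mcmc_algorithm.h.
std::optional<SearchResult>
    seed_and_mutate_once(ParallelComputationGraph &pcg,
                         MachineSpecification const &resources,
                         DeviceType device_type) {
  std::optional<MachineMapping> naive =
      get_naive_mapping(pcg, resources, device_type);
  if (!naive.has_value()) {
    return std::nullopt;
  }

  SearchResult seed = SearchResult{pcg, naive.value()};
  std::optional<MachineMapping> mutated =
      get_random_mutation(seed, resources, device_type);
  if (!mutated.has_value()) {
    return std::nullopt;
  }
  return SearchResult{pcg, mutated.value()};
}

} // namespace FlexFlow
```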
+#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "utils/containers/all_of.h" #include "utils/overload.h" namespace FlexFlow { +bool is_valid_machine_mapping_problem_tree( + MachineMappingProblemTree const &problem_tree) { + return problem_tree.visit(overload{ + [&](MMProblemTreeSeriesSplit const &series_split) { + AbstractedTensorSetMovement tensor_movement = + series_split.tensor_set_movement; + + auto contains_paths = + [](MachineMappingProblemTree const &t, + std::unordered_set const &paths) { + return all_of(paths, [&](BinaryTreePath const &p) { + return mm_problem_tree_get_subtree_at_path(t, p).has_value(); + }); + }; + + return contains_paths(series_split.get_left_child(), + get_src_layers(tensor_movement)) && + contains_paths(series_split.get_right_child(), + get_dst_layers(tensor_movement)) && + is_valid_machine_mapping_problem_tree( + series_split.get_left_child()) && + is_valid_machine_mapping_problem_tree( + series_split.get_right_child()); + }, + [&](MMProblemTreeParallelSplit const ¶llel_split) { + return is_valid_machine_mapping_problem_tree( + parallel_split.get_left_child()) && + is_valid_machine_mapping_problem_tree( + parallel_split.get_right_child()); + }, + [&](UnmappedOpCostEstimateKey const &leaf) { return true; }, + }); +} + MachineMappingProblemTree get_machine_mapping_problem_tree( ParallelComputationGraph const &pcg, PCGBinarySPDecomposition const &sp_decomposition_tree) { @@ -23,31 +59,43 @@ MachineMappingProblemTree get_machine_mapping_problem_tree( [&](PCGBinarySeriesSplit const &series) { AbstractedTensorSetMovement tensor_movement = get_abstracted_tensor_set_movement_across_split(tr_pcg, series); - return MachineMappingProblemTree{ + MachineMappingProblemTree result = MachineMappingProblemTree{ MMProblemTreeSeriesSplit{ /*tensor_set_movement=*/tensor_movement, /*lhs=*/to_problem_tree(series.get_left_child()), /*rhs=*/to_problem_tree(series.get_right_child()), }, }; + assert(is_valid_machine_mapping_problem_tree(result)); + return result; }, [&](PCGBinaryParallelSplit const ¶llel) { - return MachineMappingProblemTree{ + MachineMappingProblemTree result = MachineMappingProblemTree{ MMProblemTreeParallelSplit{ to_problem_tree(parallel.get_left_child()), to_problem_tree(parallel.get_right_child()), }, }; + assert(is_valid_machine_mapping_problem_tree(result)); + return result; }, [&](parallel_layer_guid_t const &leaf) { - return MachineMappingProblemTree{ + MachineMappingProblemTree result = MachineMappingProblemTree{ get_unmapped_op_cost_estimate_key_for_layer(pcg, leaf), }; + assert(is_valid_machine_mapping_problem_tree(result)); + return result; }, }); }; - return to_problem_tree(sp_decomposition_tree); + MachineMappingProblemTree mm_tree = to_problem_tree(sp_decomposition_tree); + + if (!is_valid_machine_mapping_problem_tree(mm_tree)) { + throw std::runtime_error("Invalid machine mapping problem tree generated"); + } + + return mm_tree; } } // namespace FlexFlow diff --git 
a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc index 1e39a7be19..7834938e41 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc @@ -1,4 +1,6 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_all_leaf_paths.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h" @@ -88,4 +90,54 @@ std::optional tree, generic_binary_sp_impl_for_mm_problem_tree(), path); } +std::string as_dot(MachineMappingProblemTree const &tree) { + std::function + get_series_label = + [](MMProblemTreeSeriesSplit const &series) -> std::string { + auto path_as_dot = [](BinaryTreePath const &path) -> std::string { + return "(" + + join_strings(path.entries, + ", ", + [](BinaryTreePathEntry const &entry) -> std::string { + if (entry == BinaryTreePathEntry::LEFT_CHILD) { + return "l"; + } else { + assert(entry == BinaryTreePathEntry::RIGHT_CHILD); + return "r"; + } + }) + + ")"; + }; + + auto path_set_as_dot = + [&](std::unordered_set const &path_set) -> std::string { + return "(" + join_strings(path_set, ", ", path_as_dot) + ")"; + }; + + return fmt::format( + "srcs={} dsts={}", + path_set_as_dot(get_src_layers(series.tensor_set_movement)), + path_set_as_dot(get_dst_layers(series.tensor_set_movement))); + }; + + std::function + get_parallel_label = + [](MMProblemTreeParallelSplit const ¶llel) -> std::string { + return "P"; + }; + + std::function get_leaf_label = + [](UnmappedOpCostEstimateKey const &leaf) -> std::string { return ""; }; + + return as_dot(tree, + generic_binary_sp_impl_for_mm_problem_tree(), + get_series_label, + get_parallel_label, + get_leaf_label); +} + +void debug_print_dot(MachineMappingProblemTree const &tree) { + std::cout << as_dot(tree) << std::endl; +} + } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc index 990b287f8b..b6d701cb98 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc @@ -1,4 +1,5 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "pcg/operator_task_space.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" @@ -18,6 +19,8 @@ UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer( transform(get_incoming_weights(pcg, layer), get_tensor_shape), /*output_shapes=*/ 
transform(get_layer_outputs(pcg, layer), get_tensor_shape), + /*op_task_space=*/ + get_operator_task_space(pcg, layer), }; } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index 3409f7f871..031b7f7fc5 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -135,4 +135,12 @@ MachineMappingResult }; } +float get_runtime_cost(MachineMappingResult const &mm_result) { + if (mm_result.raw_result == std::nullopt) { + return std::numeric_limits::infinity(); + } else { + return mm_result.raw_result.value().runtime; + } +} + } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..1bf4f5c2b7 --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" diff --git a/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc new file mode 100644 index 0000000000..6aa4dd5eff --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/generic_mcmc_state.cc @@ -0,0 +1,12 @@ +#include "compiler/mcmc/generic_mcmc_state.h" +#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { +using State = value_type<0>; +using Score = ordered_value_type<1>; + +template struct Generic_MCMC_state; +template struct Generic_MCMC_state; + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc new file mode 100644 index 0000000000..ab7769679e --- /dev/null +++ b/lib/compiler/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -0,0 +1,73 @@ +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "compiler/machine_mapping/apply_substitution_and_update_machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_mutation_set.h" +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "compiler/search_result.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/optional.h" + +namespace FlexFlow { + +SearchResult mcmc_graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + MCMCOverMappedPCGConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + + std::optional naive_mapping = + get_naive_mapping(pcg, resources, search_config.device_type); + if (naive_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + SearchResult starting_state = SearchResult{pcg, naive_mapping.value()}; + + auto generating_func = [&](SearchResult mapped_pcg, + nonnegative_int i) -> std::optional { + if (i.unwrap_nonnegative() % + search_config.substitution_interval.unwrap_nonnegative() == + 0) { + // substitutions every (substitution_interval) iterations + std::optional random_substitution = + get_random_substitution(resources); + if (random_substitution != std::nullopt) { + std::optional pattern_match = + get_random_pattern_match(random_substitution.value().pcg_pattern, + sub_pcg_from_full_pcg(mapped_pcg.pcg)); + if (pattern_match != 
std::nullopt) { + return apply_substitution_and_update_machine_mapping( + mapped_pcg, random_substitution.value(), pattern_match.value()); + } + } + return std::nullopt; + } else { + // machine mapping mutations otherwise + std::optional new_machine_mapping = + get_random_mutation(mapped_pcg, resources, search_config.device_type); + if (new_machine_mapping == std::nullopt) { + return std::nullopt; + } + return SearchResult{mapped_pcg.pcg, new_machine_mapping.value()}; + } + }; + + auto scoring_func = [&](SearchResult mapped_pcg) -> float { + return task_simulator_estimate_forward_pass_time( + mapped_pcg.pcg, cost_estimator, mapped_pcg.machine_mapping, resources); + }; + + GenericMCMCConfig config = + GenericMCMCConfig{/*temperature*/ search_config.temperature, + /*num_iterations*/ search_config.num_iterations}; + + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + + return result.get_state(); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/search_result.cc b/lib/compiler/src/compiler/search_result.cc new file mode 100644 index 0000000000..0afc10723a --- /dev/null +++ b/lib/compiler/src/compiler/search_result.cc @@ -0,0 +1,15 @@ +#include "compiler/search_result.h" + +namespace FlexFlow { + +std::string format_as(SearchResult const &r) { + return fmt::format("", + as_dot(r.pcg), + r.machine_mapping); +} + +std::ostream &operator<<(std::ostream &s, SearchResult const &r) { + return (s << fmt::to_string(r)); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc b/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc index 5eb993c6ef..7b4670c608 100644 --- a/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc +++ b/lib/compiler/src/compiler/series_parallel/pcg/pcg_binary_sp_decomposition.cc @@ -1,7 +1,10 @@ #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" +#include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h" +#include "compiler/series_parallel/pcg/pcg_binary_parallel_split.h" #include "compiler/series_parallel/pcg/pcg_binary_series_split.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/find_paths_to_leaf.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/left_associative_binary_sp_tree_from_nary.h" #include "utils/overload.h" namespace FlexFlow { @@ -82,8 +85,63 @@ BinarySPDecompositionTree } std::optional - get_pcg_balanced_binary_sp_decomposition(ParallelComputationGraph const &) { - NOT_IMPLEMENTED(); + get_pcg_balanced_binary_sp_decomposition( + ParallelComputationGraph const &pcg) { + SeriesParallelDecomposition sp_decomp = + expect(get_pcg_series_parallel_decomposition(pcg), + "Failed to get SP decomposition of PCG"); + BinarySPDecompositionTree binary_sp_tree = + left_associative_binary_sp_tree_from_nary(sp_decomp); + return pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + binary_sp_tree); +} + +PCGBinarySeriesSplit pcg_binary_series_split_from_binary_series_split( + BinarySeriesSplit const &split) { + return PCGBinarySeriesSplit{ + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_left_child()), + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_right_child()), + }; +} + +PCGBinaryParallelSplit 
pcg_binary_parallel_split_from_binary_parallel_split( + BinaryParallelSplit const &split) { + return PCGBinaryParallelSplit{ + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_left_child()), + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + split.get_right_child()), + }; +} + +PCGBinarySPDecomposition + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + BinarySPDecompositionTree const &sp_tree) { + + return sp_tree.visit(overload{ + [](BinarySeriesSplit const &series) -> PCGBinarySPDecomposition { + return PCGBinarySPDecomposition{ + pcg_binary_series_split_from_binary_series_split(series), + }; + }, + [](BinaryParallelSplit const ¶llel) -> PCGBinarySPDecomposition { + return PCGBinarySPDecomposition{ + PCGBinaryParallelSplit{ + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + parallel.get_left_child()), + pcg_binary_sp_decomposition_from_binary_sp_decomposition_tree( + parallel.get_right_child()), + }, + }; + }, + [](Node const &node) -> PCGBinarySPDecomposition { + return PCGBinarySPDecomposition{ + parallel_layer_guid_t{node}, + }; + }, + }); } std::unordered_multiset diff --git a/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc new file mode 100644 index 0000000000..22e319321b --- /dev/null +++ b/lib/compiler/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -0,0 +1,61 @@ +#include "compiler/unity_algorithm/graph_optimize_state.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" + +namespace FlexFlow { + +GraphOptimizeState::GraphOptimizeState(ParallelComputationGraph const &pcg, + float runtime_with_optimal_mm) + : pcg(pcg), runtime_with_optimal_mm(runtime_with_optimal_mm) {} + +bool GraphOptimizeState::operator==(GraphOptimizeState const &other) const { + return pcgs_are_isomorphic(pcg, other.pcg); +} + +bool GraphOptimizeState::operator!=(GraphOptimizeState const &other) const { + return !(*this == other); +} + +bool GraphOptimizeState::operator<(GraphOptimizeState const &other) const { + return runtime_with_optimal_mm < other.runtime_with_optimal_mm; +} + +std::string format_as(GraphOptimizeState const &st) { + return fmt::format("", + as_dot(st.pcg), + st.runtime_with_optimal_mm); +} + +std::ostream &operator<<(std::ostream &s, GraphOptimizeState const &st) { + return (s << fmt::to_string(st)); +} + +} // namespace FlexFlow + +namespace std { + +size_t hash<::FlexFlow::GraphOptimizeState>::operator()( + ::FlexFlow::GraphOptimizeState const &state) const { + // TODO(@wmdi): Eventually it might be good to use a proper graph hash like + // https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash.html#networkx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash + size_t seed = 0; + std::vector<::FlexFlow::parallel_layer_guid_t> layers = + topological_ordering(state.pcg); + ::FlexFlow::hash_combine(seed, layers.size()); + for (::FlexFlow::parallel_layer_guid_t const &layer : layers) { + ::FlexFlow::hash_combine(seed, get_parallel_layer_attrs(state.pcg, layer)); + std::vector<::FlexFlow::parallel_tensor_guid_t> inputs = + get_incoming_tensors(state.pcg, layer); + ::FlexFlow::hash_combine(seed, inputs.size()); + for (::FlexFlow::parallel_tensor_guid_t input : inputs) { + for (size_t i = 0; i < layers.size(); ++i) { + if (get_source_layer(input) == layers.at(i)) { + ::FlexFlow::hash_combine(seed, i); + break; + } + } + } + } + 
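+  // Note (added for exposition, not part of the original patch): the loop
+  // above encodes each edge by the topological index of its source layer,
+  // so the hash value depends on the order topological_ordering returns.
+  // Since operator== treats isomorphic PCGs as equal and std::hash requires
+  // equal values to hash equally, this implicitly assumes isomorphic PCGs
+  // enumerate their layers in the same order; the Weisfeiler-Lehman graph
+  // hash mentioned in the TODO above would remove that assumption.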
return seed; +} + +} // namespace std diff --git a/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc new file mode 100644 index 0000000000..caaefbfdbf --- /dev/null +++ b/lib/compiler/src/compiler/unity_algorithm/unity_algorithm.cc @@ -0,0 +1,138 @@ +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "compiler/machine_mapping/get_optimal_machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_cache.h" +#include "compiler/machine_mapping/machine_mapping_constraints.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_result.h" +#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h" +#include "compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.h" +#include "compiler/unity_algorithm/graph_optimize_state.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/operator_task_space.h" +#include "substitutions/apply_substitution/apply_substitution.h" +#include "substitutions/pcg_pattern.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution.h" +#include "substitutions/unity_substitution_set.h" +#include "utils/containers/generate_map.h" +#include "utils/deduplicated_priority_queue.h" +#include "utils/graph/node/algorithms.h" +#include "utils/optional.h" + +namespace FlexFlow { + +/* + * Applies a substitution to all possible positions in PCG + */ +std::vector + all_pcgs_obtained_by_applying_a_substitution( + ParallelComputationGraph const &pcg, + std::vector const &substitutions) { + std::vector results; + SubParallelComputationGraph subpcg = sub_pcg_from_full_pcg(pcg); + for (Substitution const &substitution : substitutions) { + for (PCGPatternMatch const &pattern_match : + find_pattern_matches(substitution.pcg_pattern, subpcg)) { + SubParallelComputationGraph subpcg_from_substitution = + apply_substitution(subpcg, substitution, pattern_match); + results.push_back( + pcg_from_sub_pcg_by_dropping_inputs(subpcg_from_substitution)); + } + } + return results; +} + +SearchResult graph_optimize(ParallelComputationGraph &pcg, + CostEstimator const &cost_estimator, + MachineSpecification const &resources, + UnitySearchConfig const &search_config) { + + std::vector substitutions = get_substitution_set(resources); + + MachineMappingCache cached_subgraph_costs = empty_machine_mapping_cache(); + DeduplicatedPriorityQueue candidates; + + MachineMappingContext context = MachineMappingContext{ + /*cost_estimator=*/cost_estimator, + /*allowed_machine_views=*/ + [&](UnmappedOpCostEstimateKey const &key, + MachineSpecification const &resources) + -> std::unordered_set { + return get_allowed_machine_views( + resources, key.op_task_space, DeviceType::GPU); + }, + }; + + auto optimize_pcg = [&](ParallelComputationGraph const &pcg) + -> std::pair> { + PCGBinarySPDecomposition sp_decomp = + expect(get_pcg_balanced_binary_sp_decomposition(pcg), + "Failed to get SP decomposition of PCG"); + + MachineMappingProblemTree problem_tree = + get_machine_mapping_problem_tree(pcg, sp_decomp); + MachineMappingConstraints 
constraints = + get_unconstrained_solution_for_layers(get_all_leaf_paths(problem_tree)); + + MachineMappingResult mm_result = get_optimal_machine_mapping( + cached_subgraph_costs, context, problem_tree, resources, constraints); + + return { + GraphOptimizeState{ + /*pcg=*/pcg, + /*runtime_with_optimal_mm=*/get_runtime_cost(mm_result), + }, + get_machine_mapping_from_machine_mapping_result(sp_decomp, mm_result), + }; + }; + + GraphOptimizeState best_state = optimize_pcg(pcg).first; + candidates.push(best_state); + + for (int iteration = 0; + !candidates.empty() && iteration < search_config.budget; + ++iteration) { + GraphOptimizeState current_state = candidates.top(); + candidates.pop(); + + if (current_state < best_state) { + best_state = current_state; + } else if (current_state.runtime_with_optimal_mm > + best_state.runtime_with_optimal_mm * search_config.alpha) { + continue; + } + + for (ParallelComputationGraph const &new_pcg : + all_pcgs_obtained_by_applying_a_substitution(current_state.pcg, + substitutions)) { + std::optional new_pcg_optimize_result = + optimize_pcg(new_pcg).first; + if (new_pcg_optimize_result == std::nullopt) { + continue; + } + GraphOptimizeState new_state = new_pcg_optimize_result.value(); + if (new_state.runtime_with_optimal_mm <= search_config.threshold && + get_nodes(new_pcg.raw_graph).size() <= search_config.max_num_ops) { + candidates.push(new_state); + } + } + } + + std::optional best_mapping = + optimize_pcg(best_state.pcg).second; + + if (best_mapping == std::nullopt) { + throw std::runtime_error("Failed to find any solutions"); + } + + return SearchResult{ + /*pcg=*/best_state.pcg, + /*machine_mapping=*/best_mapping.value(), + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/unity_algorithm.cc b/lib/compiler/src/unity_algorithm.cc deleted file mode 100644 index 86a211c535..0000000000 --- a/lib/compiler/src/unity_algorithm.cc +++ /dev/null @@ -1,93 +0,0 @@ -#include "compiler/unity_algorithm.h" -#include "compiler/graph_optimize_state.h" -#include "compiler/machine_mapping/get_optimal_machine_mapping.h" -#include "pcg/machine_specification.dtg.h" -#include "substitutions/substitution.h" -#include "utils/deduplicated_priority_queue.h" -#include "utils/graph/node/algorithms.h" -namespace FlexFlow { - -/* - * Gets all substitutions applicable to a PCG - */ -std::vector - get_all_applicable_substitutions(ParallelComputationGraph const &pcg) { - NOT_IMPLEMENTED(); -} - -/* - * Applies a substitution to all possible positions in PCG - */ -std::vector - apply_substitution(ParallelComputationGraph const &pcg, - Substitution const &) { - NOT_IMPLEMENTED(); -} - -GraphOptimizeResult graph_optimize( - ParallelComputationGraph &pcg, - CostEstimator const &cost_estimator, - MachineSpecification const &resources, - std::function( - ParallelLayerAttrs const &, MachineSpecification const &)> const - &allowed_machine_views, - OptimizerConfig const &opt_config) { - NOT_IMPLEMENTED(); - - // std::vector substitutions = - // get_all_applicable_substitutions(pcg); - // - // MachineMappingCache cached_subgraph_costs; - // DeduplicatedPriorityQueue candidates; - // - // MachineMappingResult original_pcg_cost = - // get_optimal_machine_mapping(pcg, - // allowed_machine_views, - // cost_estimator, - // resources, - // cached_subgraph_costs); - // - // GraphOptimizeState initial_state = { - // GraphOptimizeResult(pcg, original_pcg_cost.machine_mapping), - // original_pcg_cost.runtime}; - // - // GraphOptimizeState best_state = initial_state; - // 
candidates.push(initial_state); - // - // for (int iteration = 0; !candidates.empty() && iteration < - // opt_config.budget; - // ++iteration) { - // GraphOptimizeState current_state = candidates.top(); - // candidates.pop(); - // - // if (current_state.runtime < best_state.runtime) { - // best_state = current_state; - // } else if (current_state.runtime > best_state.runtime * opt_config.alpha) - // { - // continue; - // } - // - // for (Substitution const &substitution : substitutions) { - // for (ParallelComputationGraph const &new_pcg : apply_substitution( - // current_state.graph_optimize_result.pcg, substitution)) { - // MachineMappingResult new_pcg_cost = - // get_optimal_machine_mapping(new_pcg, - // allowed_machine_views, - // cost_estimator, - // resources, - // cached_subgraph_costs); - // GraphOptimizeState new_state{ - // GraphOptimizeResult(new_pcg, new_pcg_cost.machine_mapping), - // new_pcg_cost.runtime}; - // if (new_pcg_cost.runtime <= opt_config.threshold && - // get_nodes(new_pcg.raw_graph).size() <= opt_config.max_num_ops) { - // candidates.push(new_state); - // } - // } - // } - // } - - // return best_state.graph_optimize_result; -} - -} // namespace FlexFlow diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc deleted file mode 100644 index 817cc80700..0000000000 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ /dev/null @@ -1,110 +0,0 @@ -#include "compiler/allowed_machine_views.h" -#include "doctest/doctest.h" -#include "utils/containers/extend.h" -#include "utils/containers/range.h" -#include "utils/containers/transform.h" -#include "utils/containers/unordered_set_of.h" -#include "utils/containers/zip.h" -#include "utils/fmt/unordered_set.h" - -using namespace FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - - TEST_CASE("get_allowed_machine_views") { - - SUBCASE("1 degree of parallelism") { - MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, - /*inter_node_bandwidth=*/0, - /*intra_node_bandwidth=*/0, - }; - - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; - - std::unordered_set correct = { - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - MachineView{ - MachineSpaceCoordinate{ - /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, - MachineSpecificationDimension::INTRA_NODE}}, - }, - }; - - std::unordered_set result = - get_allowed_machine_views(ms, task, DeviceType::GPU); - - CHECK(correct == result); - } - - SUBCASE("2 degrees of parallelism") { - - MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/3_n, - /*num_cpus_per_node=*/3_n, - /*num_gpus_per_node=*/3_n, - /*inter_node_bandwidth=*/0, - /*intra_node_bandwidth=*/0, - }; - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; - - auto make_2d_view = [&](nonnegative_int start_node_idx, - nonnegative_int start_device_idx, - nonnegative_int stride1, - nonnegative_int stride2, - 
MachineSpecificationDimension m1, - MachineSpecificationDimension m2) { - return MachineView{ - MachineSpaceCoordinate{ - start_node_idx, start_device_idx, DeviceType::GPU}, - {MachineViewDimension{stride_t{stride1}, m1}, - MachineViewDimension{stride_t{stride2}, m2}}, - }; - }; - - auto intra = MachineSpecificationDimension::INTRA_NODE; - auto inter = MachineSpecificationDimension::INTER_NODE; - std::unordered_set correct = { - make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), - make_2d_view( - 1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), - make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra), - - make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), - make_2d_view( - 0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), - make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter), - }; - - std::unordered_set result = - get_allowed_machine_views(ms, task, DeviceType::GPU); - - CHECK(correct == result); - } - } -} diff --git a/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc new file mode 100644 index 0000000000..f176621a18 --- /dev/null +++ b/lib/compiler/test/src/compiler/machine_mapping/allowed_machine_views.cc @@ -0,0 +1,156 @@ +#include "compiler/machine_mapping/allowed_machine_views.h" +#include "doctest/doctest.h" +#include "utils/containers/extend.h" +#include "utils/containers/range.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" +#include "utils/containers/zip.h" +#include "utils/fmt/unordered_set.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + + TEST_CASE("get_allowed_machine_views") { + + auto make_2d_view = [&](nonnegative_int start_node_idx, + nonnegative_int start_device_idx, + nonnegative_int stride_1, + nonnegative_int stride_2, + MachineSpecificationDimension m1, + MachineSpecificationDimension m2) { + return MachineView{ + MachineSpaceCoordinate{ + start_node_idx, start_device_idx, DeviceType::GPU}, + {MachineViewDimension{stride_t{stride_1}, m1}, + MachineViewDimension{stride_t{stride_2}, m2}}, + }; + }; + auto intra = MachineSpecificationDimension::INTRA_NODE; + auto inter = MachineSpecificationDimension::INTER_NODE; + + SUBCASE("1 degree of parallelism") { + MachineSpecification ms = MachineSpecification{ + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0, + }; + + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + + std::unordered_set correct = { + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, + MachineSpecificationDimension::INTRA_NODE}}, + }, + }; + + std::unordered_set result = + get_allowed_machine_views(ms, task, DeviceType::GPU); + + CHECK(correct == result); + } + + 
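+    // Note (added for exposition, not part of the original patch): the four
+    // views above are exactly the placements of a 3-task space on one 5-GPU
+    // node: stride-1 starts at device 0, 1, or 2 (occupying {0,1,2},
+    // {1,2,3}, {2,3,4}) and a stride-2 start at device 0 (occupying
+    // {0,2,4}); a stride-2 start at device 1 would need device 5, which
+    // does not exist.
+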
SUBCASE("2 degrees of parallelism") { + + MachineSpecification ms = MachineSpecification{ + /*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0, + }; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; + + std::unordered_set correct = { + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, inter, intra), + make_2d_view( + 1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, inter, intra), + make_2d_view( + 0_n, 0_n, /*stride_1=*/2_n, /*stride_2=*/1_n, inter, intra), + + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter), + make_2d_view( + 0_n, 1_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter), + make_2d_view( + 0_n, 0_n, /*stride_1=*/2_n, /*stride_2=*/1_n, intra, inter), + }; + + std::unordered_set result = + get_allowed_machine_views(ms, task, DeviceType::GPU); + + CHECK(correct == result); + } + + SUBCASE("2D operator task space, dimensions (1,1)") { + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{1}, + /*num_gpus_per_node=*/nonnegative_int{1}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + OperatorTaskSpace task = OperatorTaskSpace{{1_n, 1_n}}; + + std::unordered_set result = + get_allowed_machine_views(full_machine_spec, task, DeviceType::GPU); + + std::unordered_set correct = { + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra), + make_2d_view( + 1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra)}; + CHECK(correct == result); + } + + SUBCASE("2D operator task space, dimensions (2,1)") { + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{2}, + /*num_gpus_per_node=*/nonnegative_int{2}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n}}; + + std::unordered_set result = + get_allowed_machine_views(full_machine_spec, task, DeviceType::GPU); + + std::unordered_set correct = { + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra), + make_2d_view( + 0_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter), + make_2d_view( + 1_n, 0_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, intra), + make_2d_view( + 0_n, 1_n, /*stride_1=*/1_n, /*stride_2=*/1_n, intra, inter)}; + CHECK(correct == result); + } + } +} diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index e506dea1d7..a45227011c 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -109,11 +109,14 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; + OperatorTaskSpace fake_op_task_space = OperatorTaskSpace{{}}; + UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -126,6 +129,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape); diff --git 
a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 048f1ddcac..9059950742 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -1,8 +1,15 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include "compiler/series_parallel/pcg/get_pcg_balanced_binary_sp_decomposition.h" #include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/operator_task_space.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/containers/extend.h" #include "utils/containers/get_only.h" +#include "utils/containers/vector_of.h" #include using namespace ::FlexFlow; @@ -90,6 +97,14 @@ TEST_SUITE(FF_TEST_SUITE) { PCGOperatorAttrs input_attrs = PCGOperatorAttrs{InputAttrs{input_shape}}; + auto make_operator_task_space = [&](ParallelTensorShape const &shape) { + std::vector degrees; + extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); + degrees.push_back(get_sum_degree(shape)); + degrees.push_back(get_discard_copy_degree(shape)); + return OperatorTaskSpace{degrees}; + }; + auto make_input_key = [&](ParallelTensorShape const ¶llel_tensor_shape) { return UnmappedOpCostEstimateKey{ @@ -97,6 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{parallel_tensor_shape}, + /*op_task_space=*/make_operator_task_space(parallel_tensor_shape), }; }; @@ -143,11 +159,15 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t relu_layer = relu_added.parallel_layer; parallel_tensor_guid_t relu_output = get_only(relu_added.outputs); + OperatorTaskSpace relu_task_space = + get_operator_task_space(pcg, relu_layer); + UnmappedOpCostEstimateKey relu_key = UnmappedOpCostEstimateKey{ /*op_attrs=*/relu_attrs, /*input_shapes=*/{par_input_shape}, /*weight_shapes=*/{}, /*output_shapes=*/{relu_output_shape}, + /*op_task_space=*/relu_task_space, }; PCGBinarySPDecomposition sp_decomposition = pcg_make_series( @@ -228,11 +248,14 @@ TEST_SUITE(FF_TEST_SUITE) { {input1_tensor, input2_tensor}, {}); parallel_layer_guid_t ew_op_layer = ew_op_added.parallel_layer; + OperatorTaskSpace ew_op_task_space = + get_operator_task_space(pcg, ew_op_layer); UnmappedOpCostEstimateKey ew_op_key = UnmappedOpCostEstimateKey{ /*op_attrs=*/ew_op_attrs, /*input_shapes=*/{par_input_shape, par_input_shape}, /*weight_shapes=*/{}, /*output_shapes=*/{ew_op_output_shape}, + /*op_task_space=*/ew_op_task_space, }; PCGBinarySPDecomposition sp_decomposition = @@ -280,4 +303,43 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } } + + TEST_CASE("from pcg") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{nonnegative_int{32}, + nonnegative_int{64}}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/nonnegative_int{16}, + 
/*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{12}, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{8}, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + PCGBinarySPDecomposition sp_decomp = + expect(get_pcg_balanced_binary_sp_decomposition(pcg), + "Failed to get SP decomposition of PCG"); + + MachineMappingProblemTree problem_tree = + get_machine_mapping_problem_tree(pcg, sp_decomp); + } } diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8ae1ebe753..f049f4b288 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -99,6 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { } }; + OperatorTaskSpace fake_op_task_space = OperatorTaskSpace{{}}; TensorShape tensor_shape = TensorShape{ TensorDims{ FFOrdered{ @@ -116,6 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -128,6 +130,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*op_task_space=*/fake_op_task_space, }; AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{ diff --git a/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc new file mode 100644 index 0000000000..ba6faa93c4 --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/generic_mcmc_algorithm.cc @@ -0,0 +1,32 @@ +#include "compiler/mcmc/generic_mcmc_algorithm.h" +#include "doctest/doctest.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("generic_mcmc_algorithm") { + float starting_state = 0.1; + auto generating_func = [](float x, + nonnegative_int i) -> std::optional { + float new_x = x + (randf() - 0.5) / (i.unwrap_nonnegative() + 1); + if (new_x < 0) { + return std::nullopt; + } + if (new_x > 1) { + return std::nullopt; + } + return new_x; + }; + auto scoring_func = [](float x) { return (x - 0.5) * (x - 0.5); }; + GenericMCMCConfig config = GenericMCMCConfig{/*temperature=*/1.0, + /*num_iterations=*/10_n}; + Generic_MCMC_state result = + minimize_score(starting_state, generating_func, scoring_func, config); + float answer = result.get_state(); + float error = result.get_score(); + CHECK(answer > 0.49); + CHECK(answer < 0.51); + CHECK(error >= 0); + CHECK(error < 0.01); + } +} diff --git a/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc new file mode 100644 index 0000000000..7d74d897e4 --- /dev/null +++ b/lib/compiler/test/src/compiler/mcmc/mcmc_over_mapped_pcg.cc @@ -0,0 +1,79 @@ +#include "compiler/mcmc/mcmc_over_mapped_pcg.h" +#include "../cost_estimator_for_test.h" +#include "compiler/task_graph_simulator/task_simulator.h" +#include "doctest/doctest.h" +#include "op-attrs/parallel_tensor_dims.h" +#include 
"op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/replica_type.dtg.h" +#include "op-attrs/shard_parallel_dim.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/integer_conversions.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("mcmc_graph_optimize") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{32_n, 64_n}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &k) { + return OpCostMetrics{ + /*forward_runtime=*/1.0, + /*backward_runtime=*/2.0, + /*memory=*/1_n, + }; + }, + [](TensorSetMovement const &) { return 1.0; }); + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + MCMCOverMappedPCGConfig search_config = + MCMCOverMappedPCGConfig{/*temperature=*/1.0, + /*num_iterations=*/100_n, + /*substitution_interval=*/5_n, + /*device_type=*/DeviceType::GPU}; + + SearchResult result = mcmc_graph_optimize( + pcg, cost_estimator, full_machine_spec, search_config); + float runtime = task_simulator_estimate_forward_pass_time( + result.pcg, cost_estimator, result.machine_mapping, full_machine_spec); + std::cout << runtime << std::endl; + + CHECK(runtime < 12); + } +} diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc similarity index 68% rename from lib/compiler/test/src/graph_optimize_state.cc rename to lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc index 5c00ce1558..3b146be93f 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/compiler/unity_algorithm/graph_optimize_state.cc @@ -1,4 +1,4 @@ -#include "compiler/graph_optimize_state.h" +#include "compiler/unity_algorithm/graph_optimize_state.h" #include "doctest/doctest.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" @@ -15,24 +15,6 @@ TEST_SUITE(FF_TEST_SUITE) { }, DataType::FLOAT, }; - // ParallelTensorShape input_shape = - // ParallelTensorShape{ParallelTensorDims{ - // FFOrdered{ - // ShardParallelDim{32_n, 2_n}, - // ShardParallelDim{16_n, 1_n}, - // }, - // ReplicaParallelDimSet{ - // SumDegree{1_n}, - // DiscardCopyDegree{1_n}, - // }, - // }, - // DataType::FLOAT}; - - // `machine_mapping` is determined by the PCG and the device mapping - // algorithm, and `runtime` is determined by the PCG and the device mapping, - // so their values here do not matter. 
- std::unordered_map empty_machine_views; - MachineMapping empty_machine_mapping(empty_machine_views); InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; @@ -70,13 +52,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg2 = create_pcg(); GraphOptimizeState state1 = GraphOptimizeState{ - GraphOptimizeResult{pcg1, empty_machine_mapping}, - 0, + pcg1, + .0, }; - GraphOptimizeState state2 = GraphOptimizeState{ - GraphOptimizeResult{pcg2, empty_machine_mapping}, - 0, + pcg2, + .0, }; CHECK(state1 == state2); @@ -100,16 +81,30 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg_ = builder_.pcg; GraphOptimizeState state1 = GraphOptimizeState{ - GraphOptimizeResult{pcg1, empty_machine_mapping}, - 0, + pcg1, + .0, }; GraphOptimizeState state_ = GraphOptimizeState{ - GraphOptimizeResult{pcg_, empty_machine_mapping}, - 0, + pcg_, + .0, }; CHECK_FALSE(state1 == state_); } } + + TEST_CASE("GraphOptimizeState::operator<") { + ParallelComputationGraph pcg1 = empty_parallel_computation_graph(); + ParallelComputationGraph pcg2 = empty_parallel_computation_graph(); + GraphOptimizeState state1 = GraphOptimizeState{ + pcg1, + 1.0, + }; + GraphOptimizeState state2 = GraphOptimizeState{ + pcg2, + 2.0, + }; + CHECK(state1 < state2); + } } diff --git a/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc new file mode 100644 index 0000000000..4ca23710e2 --- /dev/null +++ b/lib/compiler/test/src/compiler/unity_algorithm/unity_algorithm.cc @@ -0,0 +1,77 @@ +#include "compiler/unity_algorithm/unity_algorithm.h" +#include "../cost_estimator_for_test.h" +#include "doctest/doctest.h" +#include "op-attrs/parallel_tensor_dims.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/replica_type.dtg.h" +#include "op-attrs/shard_parallel_dim.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/pcg_from_computation_graph.h" +#include "utils/integer_conversions.h" + +using namespace FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("graph_optimize") { + ComputationGraph cg = [&] { + ComputationGraphBuilder b; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{ + FFOrdered{nonnegative_int{32}, + nonnegative_int{64}}, + }, + DataType::FLOAT, + }; + tensor_guid_t t = b.create_input(input_tensor_shape, CreateGrad::YES); + t = b.dense(t, + /*outDim=*/nonnegative_int{16}, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{12}, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + t = b.relu(t); + t = b.dense(t, + /*outDim=*/nonnegative_int{8}, + /*activation=*/Activation::RELU); + return b.computation_graph; + }(); + + ParallelComputationGraph pcg = pcg_from_computation_graph(cg); + + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &k) { + return OpCostMetrics{ + /*forward_runtime=*/1.0, + /*backward_runtime=*/2.0, + /*memory=*/nonnegative_int{1}, + }; + }, + [](TensorSetMovement const &) { return 1.0; }); + + MachineSpecification full_machine_spec = MachineSpecification{ + /*num_nodes=*/nonnegative_int{2}, + /*num_cpus_per_node=*/nonnegative_int{1}, + /*num_gpus_per_node=*/nonnegative_int{1}, + /*inter_node_bandwidth=*/1, + /*intra_node_bandwidth=*/1, + }; + + UnitySearchConfig search_config = 
UnitySearchConfig{ + /*alpha=*/1.0, + /*budget=*/0, + /*threshold=*/1000.0, + /*max_num_ops=*/100, + }; + + SearchResult result = + graph_optimize(pcg, cost_estimator, full_machine_spec, search_config); + + // TODO: check the result + } +} diff --git a/lib/compiler/test/src/unity_algorithm.cc b/lib/compiler/test/src/unity_algorithm.cc deleted file mode 100644 index 8ff0978ea5..0000000000 --- a/lib/compiler/test/src/unity_algorithm.cc +++ /dev/null @@ -1,26 +0,0 @@ -#include "compiler/unity_algorithm.h" -#include "doctest/doctest.h" - -TEST_SUITE(FF_TEST_SUITE) { - // Rapidcheck does not work for now - // TEST_CASE("graph_optimize") { - // RC_SUBCASE([](ComputationGraph const &g, - // float alpha, - // int budget, - // float threshold, - // int max_num_ops) { - // Strategy s = graph_optimize( - // g, - // TestCostEstimator{}, - // MachineSpecification{1, 1, 4, 0.1, 0.2}, - // [](Operator const &, MachineSpecification const &) { - // return std::unordered_set{make_1d_machine_view(0, 1, - // 1)}; - // }, - // OptimizerConfig{alpha, budget, threshold, max_num_ops}); - // RC_ASSERT(get_nodes(s.pcg).size() > 0); - // RC_ASSERT(s.machine_mapping.runtime > 0); - // RC_ASSERT(keys(s.machine_mapping.machine_views) == get_nodes(s.pcg)); - // }); - // } -} diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..f5d88f102f 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -7,8 +7,7 @@ file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False src/*.cc - src/cuda/cuda_helper.cu - src/cuda/ops/*.cu + src/cuda/*.cu ) add_library( @@ -30,6 +29,7 @@ target_link_libraries( cudnn nccl utils + pcg ) define_ff_vars(${project_target}) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..f9bef91b25 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,88 @@ #ifndef _FLEXFLOW_KERNELS_ACCESSOR_H #define _FLEXFLOW_KERNELS_ACCESSOR_H -#include "array_shape.h" -#include "device.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" -#include "utils/exception.h" +#include "pcg/device_type.dtg.h" +#include "utils/containers/transform.h" #include "utils/required.h" +#include namespace FlexFlow { +nonnegative_int + calculate_accessor_offset(LegionOrdered const &, + ArrayShape const &); + +class GenericTensorAccessorR { +public: + template + typename data_type_enum_to_class
<DT>::type const *get() const {
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    return static_cast<real_type_t<DT> const *>(this->ptr);
+  }
+
+  int32_t const *get_int32_ptr() const;
+  int64_t const *get_int64_ptr() const;
+  float const *get_float_ptr() const;
+  double const *get_double_ptr() const;
+  half const *get_half_ptr() const;
+
+  GenericTensorAccessorR() = delete;
+
+  GenericTensorAccessorR(DataType data_type,
+                         ArrayShape const &shape,
+                         void const *ptr,
+                         DeviceType device_type);
+
+  bool operator==(GenericTensorAccessorR const &) const;
+  bool operator!=(GenericTensorAccessorR const &) const;
+
+  template <DataType DT>
+  real_type_t<DT> const &at(FFOrdered<nonnegative_int> const &indices) const {
+    return this->at<DT>(legion_ordered_from_ff_ordered(indices));
+  }
+
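+  // Note (added for exposition, not part of the original patch): assuming
+  // the usual row-major linearization, calculate_accessor_offset flattens a
+  // coordinate against the ArrayShape, e.g. for shape (2, 3) the coordinate
+  // (1, 2) maps to offset 1 * 3 + 2 = 5. The FFOrdered overload above only
+  // reorders the coordinate; the LegionOrdered overload below does the
+  // actual lookup.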
+  template <DataType DT>
+  real_type_t<DT> const &
+      at(LegionOrdered<nonnegative_int> const &indices) const {
+    ASSERT(this->device_type == DeviceType::CPU,
+           "GenericTensorAccessorR::at() requires CPU-allocated tensor");
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    using T = real_type_t<DT>;
+    T const *data_ptr = static_cast<T const *>(this->ptr);
+    nonnegative_int offset = calculate_accessor_offset(indices, this->shape);
+    return data_ptr[offset.unwrap_nonnegative()];
+  }
+
+public:
+  DataType data_type;
+  ArrayShape shape;
+  void const *ptr;
+  DeviceType device_type;
+
+private:
+  std::tuple<DataType const &,
+             ArrayShape const &,
+             void const *const &,
+             DeviceType const &>
+      tie() const;
+};
+
+std::string format_as(GenericTensorAccessorR const &);
+std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
+
 class GenericTensorAccessorW {
 public:
   template <DataType DT>
   typename data_type_enum_to_class<DT>
::type *get() const {
-    if (this->data_type == DT) {
-      return static_cast<real_type_t<DT> *>(this->ptr);
-    } else {
-      throw mk_runtime_error(fmt::format(
-          "Invalid access data type ({} != {})", this->data_type, DT));
-    }
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    return static_cast<real_type_t<DT> *>(this->ptr);
   }
 
   int32_t *get_int32_ptr() const;
@@ -28,76 +91,76 @@ class GenericTensorAccessorW {
   double *get_double_ptr() const;
   half *get_half_ptr() const;
 
-public:
-  DataType data_type;
-  ArrayShape shape;
-  req<void *> ptr;
-};
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
-                                             data_type,
-                                             shape,
-                                             ptr);
+  GenericTensorAccessorW() = delete;
 
-std::string format_as(GenericTensorAccessorW const &);
-std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
+  GenericTensorAccessorW(DataType data_type,
+                         ArrayShape const &shape,
+                         void *ptr,
+                         DeviceType device_type);
+
+  bool operator==(GenericTensorAccessorW const &) const;
+  bool operator!=(GenericTensorAccessorW const &) const;
+
+  operator GenericTensorAccessorR() const;
 
-class GenericTensorAccessorR {
-public:
   template <DataType DT>
-  typename data_type_enum_to_class<DT>::type const *get() const {
-    if (this->data_type == DT) {
-      return static_cast<real_type_t<DT> const *>(this->ptr);
-    } else {
-      throw mk_runtime_error(fmt::format(
-          "Invalid access data type ({} != {})", this->data_type, DT));
-    }
+  real_type_t<DT> &at(FFOrdered<nonnegative_int> const &indices) {
+    return this->at<DT>(legion_ordered_from_ff_ordered(indices));
   }
 
-  int32_t const *get_int32_ptr() const;
-  int64_t const *get_int64_ptr() const;
-  float const *get_float_ptr() const;
-  double const *get_double_ptr() const;
-  half const *get_half_ptr() const;
+  template <DataType DT>
+  real_type_t<DT> &at(LegionOrdered<nonnegative_int> const &indices) {
+    ASSERT(this->device_type == DeviceType::CPU,
+           "GenericTensorAccessorW::at() requires CPU-allocated tensor");
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    using T = real_type_t<DT>;
+    T *data_ptr = static_cast<T *>(this->ptr);
+    nonnegative_int offset = calculate_accessor_offset(indices, this->shape);
+    return data_ptr[offset.unwrap_nonnegative()];
+  }
+
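+  // Usage sketch (added for exposition, not part of the original patch;
+  // `shape` and `cpu_allocator` are hypothetical):
+  //
+  //   GenericTensorAccessorW acc = cpu_allocator.allocate_tensor(shape);
+  //   acc.at<DataType::FLOAT>(FFOrdered<nonnegative_int>{0_n, 2_n}) = 1.5f;
+  //
+  // Both ASSERTs above guard this path: element access is only defined for
+  // CPU-resident tensors and only for the accessor's actual data type.
+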
+  template <DataType DT>
+  real_type_t<DT> const &at(FFOrdered<nonnegative_int> const &indices) const {
+    return this->at<DT>(legion_ordered_from_ff_ordered(indices));
+  }
+
+  template <DataType DT>
+  real_type_t<DT> const &
+      at(LegionOrdered<nonnegative_int> const &indices) const {
+    ASSERT(this->device_type == DeviceType::CPU,
+           "GenericTensorAccessorW::at() requires CPU-allocated tensor");
+    ASSERT(this->data_type == DT, "Invalid datatype requested");
+
+    using T = real_type_t<DT>;
+    T const *data_ptr = static_cast<T const *>(this->ptr);
+    nonnegative_int offset = calculate_accessor_offset(indices, this->shape);
+    return data_ptr[offset.unwrap_nonnegative()];
+  }
 
 public:
   DataType data_type;
   ArrayShape shape;
-  req<void const *> ptr;
+  void *ptr;
+  DeviceType device_type;
+
+private:
+  std::tuple<DataType const &,
+             ArrayShape const &,
+             void *const &,
+             DeviceType const &>
+      tie() const;
 };
-FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
-                                             data_type,
-                                             shape,
-                                             ptr);
-
-std::string format_as(GenericTensorAccessorR const &);
-std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
-int32_t *get_int32_ptr(GenericTensorAccessorW const &);
-int64_t *get_int64_ptr(GenericTensorAccessorW const &);
-float *get_float_ptr(GenericTensorAccessorW const &);
-double *get_double_ptr(GenericTensorAccessorW const &);
-half *get_half_ptr(GenericTensorAccessorW const &);
-std::vector<int32_t *>
-    get_int32_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<int64_t *>
-    get_int64_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<float *>
-    get_float_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<double *>
-    get_double_ptrs(std::vector<GenericTensorAccessorW> const &);
-std::vector<half *> get_half_ptrs(std::vector<GenericTensorAccessorW> const &);
+std::string format_as(GenericTensorAccessorW const &);
+std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
 
 static_assert(is_fmtable<req<DataType> const &>::value, "");
 
 template <DataType DT>
 typename data_type_enum_to_class<DT>
::type * get(GenericTensorAccessorW const &a) { - if (a.data_type == DT) { - return static_cast *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast *>(a.ptr); } template @@ -113,12 +176,8 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - if (a.data_type == DT) { - return static_cast const *>(a.ptr); - } else { - throw mk_runtime_error( - fmt::format("Invalid access data type ({} != {})", a.data_type, DT)); - } + ASSERT(a.data_type == DT, "Invalid datatype requested"); + return static_cast const *>(a.ptr); } int32_t const *get_int32_ptr(GenericTensorAccessorR const &); @@ -137,6 +196,21 @@ std::vector std::vector get_half_ptrs(std::vector const &); +int32_t *get_int32_ptr(GenericTensorAccessorW const &); +int64_t *get_int64_ptr(GenericTensorAccessorW const &); +float *get_float_ptr(GenericTensorAccessorW const &); +double *get_double_ptr(GenericTensorAccessorW const &); +half *get_half_ptr(GenericTensorAccessorW const &); +std::vector + get_int32_ptrs(std::vector const &); +std::vector + get_int64_ptrs(std::vector const &); +std::vector + get_float_ptrs(std::vector const &); +std::vector + get_double_ptrs(std::vector const &); +std::vector get_half_ptrs(std::vector const &); + template std::vector const *> get(std::vector const &accs) { @@ -150,12 +224,8 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); +bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2); bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, @@ -163,8 +233,9 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, std::pair get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); + +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorR const &src_accessor); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h index 6500899394..39bad6599c 100644 --- a/lib/kernels/include/kernels/allocation.h +++ b/lib/kernels/include/kernels/allocation.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ALLOCATION_H #define _FLEXFLOW_KERNELS_ALLOCATION_H -#include "accessor.h" +#include "kernels/accessor.h" #include #include @@ -11,6 +11,8 @@ struct IAllocator { virtual void *allocate(size_t) = 0; virtual void deallocate(void *) = 0; + virtual DeviceType get_allocation_device_type() const = 0; + virtual ~IAllocator() = default; }; @@ -18,9 +20,14 @@ struct Allocator { Allocator() = delete; GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape); + void deallocate_tensor(GenericTensorAccessorW const &); + void deallocate_tensor(GenericTensorAccessorR const &); + void *allocate(size_t mem_size); void deallocate(void *ptr); + DeviceType get_allocation_device_type() const; + template static typename std::enable_if::value, Allocator>::type diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/kernels/include/kernels/array_coord.struct.toml new file mode 100644 index 0000000000..8ce121f2bf --- /dev/null +++ b/lib/kernels/include/kernels/array_coord.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "ArrayCoord" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", + "json", +] + +includes = [ + "op-attrs/ff_ordered/ff_ordered.h", + 
"utils/nonnegative_int/nonnegative_int.h" +] + +[[fields]] +name = "ff_ordered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 57498ee466..25ef8116f2 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H +#include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -15,9 +16,7 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + explicit ArrayShape(LegionOrdered const &dims); /** * @brief Alias of ArrayShape::num_elements for compatibility with @@ -46,24 +45,40 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape - sub_shape(std::optional> start, - std::optional> end) const; + ArrayShape sub_shape(ff_dim_t const &start, + std::optional const &end) const; + + ArrayShape sub_shape(legion_dim_t const &start, + std::optional const &end) const; public: LegionOrdered dims; private: std::tuple tie() const; + + friend ::std::hash; }; +std::string format_as(ArrayShape const &); +std::ostream &operator<<(std::ostream &, ArrayShape const &); + nonnegative_int get_volume(ArrayShape const &); +ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); -std::string format_as(ArrayShape const &); -std::ostream &operator<<(std::ostream &, ArrayShape const &); +std::unordered_set get_array_coord_set(ArrayShape const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; + +} // namespace std + #endif diff --git a/lib/kernels/include/kernels/attention_kernels.h b/lib/kernels/include/kernels/attention_kernels.h index eb5a1b8198..b3c77d3430 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ATTENTION_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/device.h" #include "kernels/ff_handle.h" @@ -64,8 +63,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, std::string format_as(MHAPerDeviceState const &x); std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); -namespace Kernels { -namespace MultiHeadAttention { +namespace Kernels::MultiHeadAttention { MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, Allocator &, @@ -105,8 +103,7 @@ void backward_kernel(ffStream_t stream, void cleanup_kernel(Allocator &allocator, MHAPerDeviceState const &device_state); -} // namespace MultiHeadAttention -} // namespace Kernels +} // namespace Kernels::MultiHeadAttention } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index bfd72647b0..8b67f564d2 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define 
_FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" -namespace FlexFlow { -namespace Kernels { -namespace BatchMatmul { +namespace FlexFlow::Kernels::BatchMatmul { void forward_kernel(ffStream_t stream, PerDeviceFFHandle const &handle, @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream, int k, int batch); -} // namespace BatchMatmul -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::BatchMatmul #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index f2ca17f429..9bb2753a12 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -1,15 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H #define _FLEXFLOW_KERNELS_BATCH_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include -namespace FlexFlow { -namespace Kernels { -namespace BatchNorm { +namespace FlexFlow::Kernels::BatchNorm { BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, @@ -29,9 +27,9 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, BatchNormPerDeviceState const &per_device_state, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, @@ -46,8 +44,5 @@ void cleanup_kernel(Allocator allocator, bool relu, float *runningMean); -} // namespace BatchNorm -} // namespace Kernels -} // namespace FlexFlow - +} // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 96f9aadd52..5ec4cb3975 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -1,29 +1,19 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H -#include "device.h" #include "kernels/accessor.h" -#include "kernels/ff_handle.h" -#include "op-attrs/activation.dtg.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Cast { +namespace FlexFlow::Kernels::Cast { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Cast -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Cast #endif diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h new file mode 100644 index 0000000000..343ba253d9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const 
&output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h index eb263e0734..c87465a01f 100644 --- a/lib/kernels/include/kernels/combine_kernels.h +++ b/lib/kernels/include/kernels/combine_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Combine { +namespace FlexFlow::Kernels::Combine { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, @@ -16,8 +14,6 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad); -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Combine #endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h new file mode 100644 index 0000000000..75fdd56498 --- /dev/null +++ b/lib/kernels/include/kernels/combine_kernels_cpu.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Combine { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Combine + +#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index a44affc1f2..1e3c55bf59 100644 --- a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void forward_kernel(ffStream_t stream, GenericTensorAccessorW const &output, @@ -18,8 +16,6 @@ void backward_kernel(ffStream_t stream, std::vector const &input_grads, ff_dim_t axis); -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat #endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index cfc64f963d..3b7c0672df 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "utils/visitable.h" @@ -34,8 +34,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, bwdFilterAlgo, bwdDataAlgo); -namespace Kernels { -namespace Conv2D { +namespace Kernels::Conv2D { Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -61,17 +60,16 @@ void 
forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, std::optional activation); -} // namespace Conv2D -} // namespace Kernels +} // namespace Kernels::Conv2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h new file mode 100644 index 0000000000..81fd59dafb --- /dev/null +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H +#define _FLEXFLOW_KERNELS_COPY_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator); + +GenericTensorAccessorR + copy_tensor_accessor_r_to_cpu_if_necessary(GenericTensorAccessorR const &, + Allocator &cpu_allocator); + +GenericTensorAccessorW + copy_tensor_accessor_w_to_cpu_if_necessary(GenericTensorAccessorW const &, + Allocator &cpu_allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/datatype_dispatch.h b/lib/kernels/include/kernels/datatype_dispatch.h index e83fc3325d..50ca66a820 100644 --- a/lib/kernels/include/kernels/datatype_dispatch.h +++ b/lib/kernels/include/kernels/datatype_dispatch.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H #define _FLEXFLOW_KERNELS_DATATYPE_DISPATCH_H -#include "accessor.h" +#include "op-attrs/datatype.h" +#include "utils/exception.h" namespace FlexFlow { @@ -33,7 +34,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(Args... args) const { + Out operator()(Args &&...args) const { return F
{}(std::forward(args)...); } }; @@ -41,7 +42,7 @@ struct DataTypeDispatch1 { template >()( std::declval()...))> - Out operator()(DataType data_type, Args... args) { + Out operator()(DataType data_type, Args &&...args) { return dispatch(data_type, std::forward(args)...); } }; @@ -54,13 +55,13 @@ struct DataTypeDispatch2 { template struct OutputType { template - void operator()(Args... args) const { + void operator()(Args &&...args) const { F{}(std::forward(args)...); } }; template - void operator()(DataType output_type, Args... args) const { + void operator()(DataType output_type, Args &&...args) const { dispatch(output_type, std::forward(args)...); } }; @@ -68,7 +69,7 @@ struct DataTypeDispatch2 { template void operator()(DataType input_data_type, DataType output_data_type, - Args... args) { + Args &&...args) { dispatch( input_data_type, output_data_type, std::forward(args)...); } diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index c0e503be5b..2cc6dd60a3 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H -#include "device.h" #include "kernels/allocation.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include @@ -31,8 +31,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, reserveSpaceSize, dropoutStateSize); -namespace Kernels { -namespace Dropout { +namespace Kernels::Dropout { DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, float rate, @@ -56,8 +55,7 @@ void cleanup_kernel(Allocator allocator, ffDropoutDescriptor_t dropoutDesc, void *dropoutStates); -} // namespace Dropout -} // namespace Kernels +} // namespace Kernels::Dropout } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index 41447e98e6..fd596f2ccf 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "device.h" #include "ff_handle.h" #include "kernels/array_shape.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -26,8 +26,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, opDesc, reduceAddDesc); -namespace Kernels { -namespace ElementBinary { +namespace Kernels::ElementBinary { ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, OperatorType op_type, @@ -58,8 +57,7 @@ void backward_kernel(ffStream_t stream, bool broadcast_inputRHS, PerDeviceFFHandle handle); -} // namespace ElementBinary -} // namespace Kernels +} // namespace Kernels::ElementBinary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 8c6864b2d9..0257b3b4a6 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" 
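Note on the `DataTypeDispatch1`/`DataTypeDispatch2` hunk above: the argument packs change from by-value `Args... args` to forwarding references `Args &&...args`, matching the existing `std::forward<Args>(args)...` calls. A minimal self-contained sketch of the same runtime-to-compile-time dispatch pattern follows; the `DataType` enum and `PrintName` functor here are illustrative stand-ins, not the real FlexFlow types:

```cpp
#include <iostream>
#include <stdexcept>
#include <utility>

enum class DataType { FLOAT, INT32 };

// Map a runtime DataType onto a compile-time template parameter and
// perfectly forward the remaining arguments, as DataTypeDispatch1 does.
template <template <DataType> class F, typename... Args>
auto dispatch(DataType dt, Args &&...args) {
  switch (dt) {
    case DataType::FLOAT:
      return F<DataType::FLOAT>{}(std::forward<Args>(args)...);
    case DataType::INT32:
      return F<DataType::INT32>{}(std::forward<Args>(args)...);
  }
  throw std::runtime_error("unhandled DataType");
}

template <DataType DT>
struct PrintName {
  void operator()(char const *prefix) const {
    std::cout << prefix << (DT == DataType::FLOAT ? "float" : "int32") << '\n';
  }
};

int main() {
  dispatch<PrintName>(DataType::FLOAT, "dispatched as: ");
  dispatch<PrintName>(DataType::INT32, "dispatched as: ");
}
```

Forwarding references avoid a copy per dispatched argument: with the old by-value packs, passing an accessor through a two-level dispatch copied it at every level.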
#include "op-attrs/ops/element_unary.h" #include @@ -19,8 +19,7 @@ FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, outputTensor, actiDesc); -namespace Kernels { -namespace ElementUnary { +namespace Kernels::ElementUnary { ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, ArrayShape const &output_shape, @@ -37,13 +36,12 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); -} // namespace ElementUnary -} // namespace Kernels +} // namespace Kernels::ElementUnary } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index 06582ca1d5..f51a730314 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -1,13 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/embedding.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, @@ -19,11 +17,11 @@ void forward_kernel(ffStream_t stream, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, @@ -35,8 +33,6 @@ void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); template __global__ void rand_generate_int(TD *ptr, size_t size, TD p); -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 179ce41cbf..31b3296a98 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -5,7 +5,7 @@ #include #endif -#include "device.h" +#include "kernels/device.h" #include "utils/visitable.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index 3e600c48de..b2b1164f92 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -1,23 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Flat { +namespace FlexFlow::Kernels::Flat { void forward_kernel(ffStream_t stream, GenericTensorAccessorR input, float *output_ptr); + void backward_kernel(ffStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr); + float const *output_grad_ptr, + float *input_grad_ptr); -} // namespace 
Flat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Flat #endif // _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H diff --git a/lib/kernels/include/kernels/format_accessor_contents.h b/lib/kernels/include/kernels/format_accessor_contents.h new file mode 100644 index 0000000000..b50cffbbef --- /dev/null +++ b/lib/kernels/include/kernels/format_accessor_contents.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FORMAT_ACCESSOR_CONTENTS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +std::string format_accessor_r_contents(GenericTensorAccessorR const &); +std::string format_accessor_w_contents(GenericTensorAccessorW const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 13bf4b898a..8cbc7e457e 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -15,23 +15,21 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, handle, legion_dim); -namespace Kernels { -namespace Gather { +namespace Kernels::Gather { void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, + GatherPerDeviceState const &per_device_state, GenericTensorAccessorR const &output_grad, GenericTensorAccessorR const &index, GenericTensorAccessorW const &input_grad); -} // namespace Gather -} // namespace Kernels +} // namespace Kernels::Gather } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index be13d32879..10cf2fb14b 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" namespace FlexFlow { @@ -30,8 +30,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, bias, data_type); -namespace Kernels { -namespace LayerNorm { +namespace Kernels::LayerNorm { // todo: this may have some problem. 
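A recurring mechanical change in this PR is collapsing the old three-level `namespace FlexFlow { namespace Kernels { namespace X {` nesting into C++17 nested namespace definitions. The two spellings declare exactly the same entities; a quick illustration with a hypothetical `Demo` kernel namespace:

```cpp
// Before: pre-C++17 nesting, as these headers were originally written.
namespace FlexFlow {
namespace Kernels {
namespace Demo {
void forward_kernel();
} // namespace Demo
} // namespace Kernels
} // namespace FlexFlow

// After: C++17 nested namespace definition, as used throughout this PR.
// Declares the same entity: FlexFlow::Kernels::Demo::forward_kernel.
namespace FlexFlow::Kernels::Demo {
void forward_kernel();
} // namespace FlexFlow::Kernels::Demo
```

Headers that also declare per-device state structs directly in `FlexFlow` keep the outer namespace open and collapse only the inner two (e.g. `namespace Kernels::LayerNorm`), which is why both forms appear in this diff.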
LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, @@ -57,8 +56,7 @@ void backward_kernel(ffStream_t stream, GenericTensorAccessorW const &gamma_grad, GenericTensorAccessorW const &beta_grad); -} // namespace LayerNorm -} // namespace Kernels +} // namespace Kernels::LayerNorm } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 7b9b9c455c..947bbd00bb 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -2,7 +2,13 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LEGION_DIM_H #include "kernels/legion_dim_t.dtg.h" -#include "op-attrs/dim_ordered/dim_ordered.h" +#include "kernels/legion_ordered/legion_ordered.h" +#include "op-attrs/ff_dim_t.dtg.h" +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -11,7 +17,10 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions); template -using LegionOrdered = DimOrdered; +std::set key_range(LegionOrdered const &d) { + return transform(set_of(nonnegative_range(num_elements(d))), + [](nonnegative_int i) { return legion_dim_t{i}; }); +} template FFOrdered @@ -25,17 +34,6 @@ LegionOrdered return LegionOrdered(ff_ordered.rbegin(), ff_ordered.rend()); } -template -std::string format_as(LegionOrdered const &v) { - std::vector as_vec(v.cbegin(), v.cend()); - return fmt::format("", as_vec); -} - -template -std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { - return (s << fmt::to_string(v)); -} - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h new file mode 100644 index 0000000000..ad8b3bad6d --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h @@ -0,0 +1,197 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_LEGION_ORDERED_H + +#include "kernels/legion_dim_t.dtg.h" +#include "utils/fmt/vector.h" +#include "utils/stack_vector/stack_vector.h" + +namespace FlexFlow { + +template +struct LegionOrdered { + LegionOrdered() {} + + LegionOrdered(std::initializer_list const &l) + : contents(l.begin(), l.end()) {} + + LegionOrdered(std::vector const &contents) + : contents(contents.begin(), contents.end()) {} + + template + LegionOrdered(It begin, It end) : contents(begin, end) {} + + template + LegionOrdered(stack_vector const &contents) + : contents(contents.begin(), contents.end()) {} + + T const &at(legion_dim_t idx) const { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T &at(legion_dim_t idx) { + int raw = idx.value.unwrap_nonnegative(); + return this->contents.at(raw); + } + + T const &operator[](legion_dim_t idx) const { + return this->at(idx); + } + + T &operator[](legion_dim_t idx) { + return this->at(idx); + } + + bool idx_is_valid(legion_dim_t const &idx) const { + int raw = idx.value.unwrap_nonnegative(); + return raw < this->contents.size(); + } + + bool operator==(LegionOrdered const &other) const { + return this->contents == other.contents; + } + + bool operator!=(LegionOrdered const &other) 
const { + return this->contents != other.contents; + } + + using iterator = typename stack_vector::iterator; + using const_iterator = + typename stack_vector::const_iterator; + using reverse_iterator = + typename stack_vector::reverse_iterator; + using const_reverse_iterator = + typename stack_vector::const_reverse_iterator; + using value_type = T; + using pointer = value_type *; + using const_pointer = value_type const *; + using reference = value_type &; + using const_reference = value_type const &; + + iterator begin() { + return this->contents.begin(); + } + + const_iterator begin() const { + return this->cbegin(); + } + + const_iterator cbegin() const { + return this->contents.cbegin(); + } + + iterator end() { + return this->contents.end(); + } + + const_iterator end() const { + return this->cend(); + } + + const_iterator cend() const { + return this->contents.cend(); + } + + reverse_iterator rbegin() { + return this->contents.rbegin(); + } + + const_reverse_iterator rbegin() const { + return this->crbegin(); + } + + const_reverse_iterator crbegin() const { + return this->contents.crbegin(); + } + + reverse_iterator rend() { + return this->contents.rend(); + } + + const_reverse_iterator rend() const { + return this->crend(); + } + + const_reverse_iterator crend() const { + return this->contents.crend(); + } + + size_t size() const { + return this->contents.size(); + } + + size_t empty() const { + return this->contents.empty(); + } + + size_t num_dims() const { + return this->size(); + } + + friend struct ::std::hash; + +private: + stack_vector contents; +}; + +template +auto operator<(LegionOrdered const &lhs, LegionOrdered const &rhs) + -> std::enable_if_t, bool> { + return std::lexicographical_compare( + lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend()); +} + +template +std::string format_as(LegionOrdered const &v) { + std::vector as_vec(v.cbegin(), v.cend()); + return fmt::format("", as_vec); +} + +template +std::ostream &operator<<(std::ostream &s, LegionOrdered const &v) { + return (s << fmt::to_string(v)); +} + +} // namespace FlexFlow + +namespace nlohmann { +template +struct adl_serializer<::FlexFlow::LegionOrdered> { + static ::FlexFlow::LegionOrdered from_json(nlohmann::json const &j) { + return {j.template get>()}; + } + + static void to_json(nlohmann::json &j, + ::FlexFlow::LegionOrdered const &x) { + j = std::vector{x.cbegin(), x.cend()}; + } +}; +} // namespace nlohmann + +namespace std { + +template +struct hash<::FlexFlow::LegionOrdered> { + size_t operator()(::FlexFlow::LegionOrdered const &t) const { + static_assert(::FlexFlow::is_hashable::value, + "Elements must be hashable"); + + return get_std_hash(t.contents); + } +}; + +} // namespace std + +namespace rc { + +template +struct Arbitrary<::FlexFlow::LegionOrdered> { + static Gen<::FlexFlow::LegionOrdered> arbitrary() { + return gen::construct<::FlexFlow::LegionOrdered>( + gen::arbitrary<::FlexFlow::stack_vector>()); + } +}; + +} // namespace rc + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/slice.h b/lib/kernels/include/kernels/legion_ordered/slice.h new file mode 100644 index 0000000000..6980c0d9ec --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/slice.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_SLICE_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/slice.h" +#include "utils/containers/transform.h" +#include 
"utils/containers/vector_of.h" + +namespace FlexFlow { + +template +LegionOrdered slice(LegionOrdered const &d, + legion_dim_t const &start, + std::optional const &end) { + int raw_start = start.value.unwrap_nonnegative(); + std::optional raw_end = transform( + end, [](legion_dim_t const &i) { return i.value.unwrap_nonnegative(); }); + + return LegionOrdered{slice(vector_of(d), raw_start, raw_end)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_ordered/transform.h b/lib/kernels/include/kernels/legion_ordered/transform.h new file mode 100644 index 0000000000..55cc1ff1ea --- /dev/null +++ b/lib/kernels/include/kernels/legion_ordered/transform.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LEGION_ORDERED_TRANSFORM_H + +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/vector_transform.h" + +namespace FlexFlow { + +template > +LegionOrdered transform(LegionOrdered const &d, F &&f) { + return LegionOrdered{vector_transform(vector_of(d), f)}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 3128e39fd0..21d84c2567 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" @@ -33,8 +33,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, weight_type, output_type); -namespace Kernels { -namespace Linear { +namespace Kernels::Linear { LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, float *one_ptr, @@ -51,29 +50,28 @@ bool use_activation(Activation activation); void forward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *filter_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size); void backward_kernel(ffStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size); -} // namespace Linear -} // namespace Kernels +} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/kernels/include/kernels/local_cpu_allocator.h similarity index 74% rename from lib/local-execution/include/local-execution/local_cpu_allocator.h rename to lib/kernels/include/kernels/local_cpu_allocator.h index d1e81facf2..9653dcf00e 100644 --- a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ b/lib/kernels/include/kernels/local_cpu_allocator.h @@ -1,3 +1,6 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOCAL_CPU_ALLOCATOR_H + #include "kernels/allocation.h" #include @@ 
-12,6 +15,8 @@ struct LocalCPUAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map> ptrs; }; @@ -20,3 +25,5 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); Allocator create_local_cpu_memory_allocator(); } // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/local_cuda_allocator.h b/lib/kernels/include/kernels/local_cuda_allocator.h index 18a4b6e78a..b8e0540974 100644 --- a/lib/kernels/include/kernels/local_cuda_allocator.h +++ b/lib/kernels/include/kernels/local_cuda_allocator.h @@ -12,6 +12,8 @@ struct LocalCudaAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_set ptrs; }; diff --git a/lib/kernels/include/kernels/managed_ff_stream.h b/lib/kernels/include/kernels/managed_ff_stream.h index 2f690b2eb3..576edb0ffa 100644 --- a/lib/kernels/include/kernels/managed_ff_stream.h +++ b/lib/kernels/include/kernels/managed_ff_stream.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H #define _FLEXFLOW_KERNELS_MANAGED_FF_STREAM_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { @@ -19,6 +19,9 @@ struct ManagedFFStream { ffStream_t const &raw_stream() const; +private: + void cleanup(); + private: ffStream_t *stream; }; diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..9bd9370685 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,10 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle() = delete; + + ManagedPerDeviceFFHandle(size_t workSpaceSize, + bool allowTensorOpMathConversion); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -21,6 +24,9 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle const &raw_handle() const; +private: + void cleanup(); + private: PerDeviceFFHandle *handle; }; diff --git a/lib/kernels/include/kernels/metrics_kernels.h b/lib/kernels/include/kernels/metrics_kernels.h index e4660808b9..430608db55 100644 --- a/lib/kernels/include/kernels/metrics_kernels.h +++ b/lib/kernels/include/kernels/metrics_kernels.h @@ -1,25 +1,24 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_METRICS_KERNELS_H -#include "perf_metrics.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { -void update_metrics_sparse_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - int const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); -void update_metrics_label_kernel(ffStream_t, - MetricsAttrs const &, - float const *logit_ptr, - float const *label_ptr, - int num_samples, - int num_classes, - PerfMetrics &perf_zc); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc); } // namespace FlexFlow #endif diff --git 
a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h index b8a6784676..042911d172 100644 --- a/lib/kernels/include/kernels/nccl.h +++ b/lib/kernels/include/kernels/nccl.h @@ -23,15 +23,11 @@ struct ncclUniqueId {}; struct ncclComm_t {}; #endif -namespace FlexFlow { -namespace Kernels { -namespace NCCL { +namespace FlexFlow::Kernels::NCCL { ncclUniqueId generate_unique_id(); ncclComm_t create_comm(ncclUniqueId const &, int num_ranks, int my_rank); -} // namespace NCCL -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::NCCL #endif diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..d552831c78 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -16,15 +17,18 @@ void sgd_ps_update_task_gpu(ffStream_t, float *weight_ptr, float *sgd_v_ptr); +#ifdef FF_USE_NCCL void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, float *sgd_v_ptr); +#endif void adam_ps_update_task_gpu(ffStream_t, float alpha_t, @@ -33,9 +37,11 @@ void adam_ps_update_task_gpu(ffStream_t, float weight_decay, float epsilon, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + int num_replicas, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, @@ -45,9 +51,10 @@ void adam_nccl_update_task_gpu(ffStream_t, float epsilon, PerDeviceFFHandle const &, float const *weight_grad_ptr, - float *adam_m_ptr, + size_t size, + float *weight_ptr, float *adam_v_ptr, - float *weight_ptr); + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h index 64ef1a1352..aa3a7a1ef7 100644 --- a/lib/kernels/include/kernels/partition_kernels.h +++ b/lib/kernels/include/kernels/partition_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct RepartitionPerDeviceState { FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); -namespace Kernels { -namespace Repartition { +namespace Kernels::Repartition { RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, DataType data_type); @@ -26,11 +25,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &output_grad, - GenericTensorAccessorR const &input_grad); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); -} // namespace Repartition -} // namespace Kernels +} // namespace Kernels::Repartition } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml similarity index 
100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/kernels/include/kernels/per_device_op_state.variant.toml diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 798c0507f8..76aa07d0a4 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, poolDesc, relu); -namespace Kernels { -namespace Pool2D { +namespace Kernels::Pool2D { Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, std::optional activation, @@ -70,13 +69,12 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr); + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); -} // namespace Pool2D -} // namespace Kernels +} // namespace Kernels::Pool2D } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 655d540685..7c4145c426 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_PROFILING_H #define _FLEXFLOW_KERNELS_PROFILING_H -#include "device.h" +#include "kernels/device.h" #include "kernels/profiling_settings.dtg.h" #include "utils/visitable.h" diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 4287472875..10e8e4393b 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "array_shape.h" -#include "device.h" -#include "ff_handle.h" +#include "kernels/array_shape.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" #include "op-attrs/operator_type.dtg.h" namespace FlexFlow { @@ -25,8 +25,7 @@ FF_VISITABLE_STRUCT(ReducePerDeviceState, op_type, reduction_size); -namespace Kernels { -namespace Reduce { +namespace Kernels::Reduce { ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, OperatorType const &, @@ -43,8 +42,7 @@ void backward_kernel(ffStream_t stream, ReducePerDeviceState const &m, float const *output_grad_ptr, float *input_grad_ptr); -} // namespace Reduce -} // namespace Kernels +} // namespace Kernels::Reduce } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h index fb3baf215c..08f73cd9ab 100644 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ b/lib/kernels/include/kernels/reduction_kernels.h @@ -1,12 +1,10 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Reduction { +namespace FlexFlow::Kernels::Reduction { void forward_kernel(ffStream_t stream, 
GenericTensorAccessorR const &input, @@ -14,11 +12,9 @@ void forward_kernel(ffStream_t stream, size_t num_replicas); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reduction #endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h index 409fc81f44..0b113868ee 100644 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ b/lib/kernels/include/kernels/replicate_kernels.h @@ -1,24 +1,20 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" -namespace FlexFlow { -namespace Kernels { -namespace Replicate { +namespace FlexFlow::Kernels::Replicate { void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(ffStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas); -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Replicate #endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h new file mode 100644 index 0000000000..2a2eaa5eb6 --- /dev/null +++ b/lib/kernels/include/kernels/replicate_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Replicate { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas); + +} // namespace FlexFlow::Kernels::Replicate + +#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index a83caa6bea..88c11d2fb0 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "utils/required_core.h" namespace FlexFlow { @@ -13,8 +13,7 @@ struct ReshapePerDeviceState { FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); -namespace Kernels { -namespace Reshape { +namespace Kernels::Reshape { ReshapePerDeviceState init_kernel(DataType data_type); @@ -25,11 +24,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, ReshapePerDeviceState const &per_device_state, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); -} // namespace Reshape -} // namespace Kernels +} // namespace Kernels::Reshape } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git 
a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 42a83ae219..768707175c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,30 +1,21 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/reverse_kernels_cpu.h" -namespace FlexFlow { -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { void forward_kernel(ffStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size); + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); void backward_kernel(ffStream_t stream, - float const *out_grad_ptr, - float *in_grad_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t input_size); + GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); -} // namespace Reverse -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reverse #endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h new file mode 100644 index 0000000000..ec82000f8f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H +#define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif // _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h new file mode 100644 index 0000000000..766d70b915 --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H + +#include "kernels/array_shape.h" +#include "kernels/reverse_kernels_params.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml new file mode 100644 index 0000000000..a5dbd750bc --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "ReverseKernelsParams" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "num_out_blks" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "reverse_dim_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] 
+name = "in_blk_size" +type = "::FlexFlow::nonnegative_int" + +[[fields]] +name = "out_size" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 061230ec52..60101578e3 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "device.h" #include "ff_handle.h" +#include "kernels/device.h" namespace FlexFlow { @@ -15,8 +15,7 @@ struct SoftmaxPerDeviceState { FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); -namespace Kernels { -namespace Softmax { +namespace Kernels::Softmax { SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, int dim, @@ -31,12 +30,11 @@ void forward_kernel(ffStream_t stream, float *output_ptr); void backward_kernel(ffStream_t stream, - float *input_grad_ptr, float const *output_grad_ptr, + float *input_grad_ptr, size_t num_elements); -} // namespace Softmax -} // namespace Kernels +} // namespace Kernels::Softmax } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 36434d4be8..3b580f94be 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,12 +1,9 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "device.h" +#include "kernels/device.h" -namespace FlexFlow { - -namespace Kernels { -namespace Split { +namespace FlexFlow::Kernels::Split { void forward_kernel(ffStream_t stream, float **out_ptrs, float const *in_ptr, @@ -22,8 +19,6 @@ void backward_kernel(ffStream_t stream, coord_t num_blks, int numOutputs); -} // namespace Split -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Split #endif // _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index ae1c739f6c..085594d57f 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "device.h" #include "kernels/allocation.h" +#include "kernels/device.h" namespace FlexFlow { @@ -12,8 +12,7 @@ struct TopKPerDeviceState { FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); -namespace Kernels { -namespace TopK { +namespace Kernels::TopK { TopKPerDeviceState init_kernel(bool sorted); @@ -35,8 +34,7 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace TopK -} // namespace Kernels +} // namespace Kernels::TopK } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 0f1cc2ae61..776370dcbd 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -1,15 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H -#include "device.h" #include "kernels/accessor.h" +#include "kernels/device.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -namespace Kernels { -namespace Transpose { +namespace Kernels::Transpose { void forward_kernel(cudaStream_t stream, TransposeAttrs 
const &attrs, @@ -18,11 +17,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, TransposeAttrs const &attrs, - GenericTensorAccessorW const &in_grad, - GenericTensorAccessorR const &out_grad); + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); -} // namespace Transpose -} // namespace Kernels +} // namespace Kernels::Transpose } // namespace FlexFlow #endif // _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 27b7eb390d..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,192 +0,0 @@ -#include "kernels/accessor.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const 
&a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index d666592e77..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, tensor_shape, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc new file mode 100644 index 0000000000..cdd57b8947 --- /dev/null +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/cast_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Cast { + +template +void cpu_cast_forward(IDT const *input, ODT *output, size_t volume) { + for (size_t i = 0; i < volume; ++i) { + output[i] = static_cast(input[i]); + } +} + +template +void cpu_cast_backward(IDT const *input, ODT *output, size_t volume, ODT beta) { + for (size_t i = 0; i < volume; i++) { + output[i] = static_cast(input[i]) + beta * output[i]; + } +} + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + size_t volume = input.shape.get_volume().unwrap_nonnegative(); + cpu_cast_forward(input.get(), output.get(), volume); + } +}; + +template +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); + cpu_cast_backward( + output.get(), input.get(), volume, cast_to(1.0f)); + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch2{}( + input.data_type, output.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + 
DataTypeDispatch2{}( + output.data_type, input.data_type, output, input); +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc new file mode 100644 index 0000000000..577984f21a --- /dev/null +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/combine_kernels_cpu.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow::Kernels::Combine { + +template +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + memcpy(output.get
<DT>(), + input.get<DT>
(), + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template <DataType DT> +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + for (size_t i = 0; i < num_elements; ++i) { + input_grad.get<DT>
()[i] += output_grad.get<DT>
()[i]; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1<CPUBackwardKernel>{}( + input_grad.data_type, output_grad, input_grad); +} + +} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc similarity index 100% rename from lib/kernels/src/cpu/initializer_kernels.cc rename to lib/kernels/src/cpu/ops/initializer_kernels.cc diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..798a4ea8c7 --- /dev/null +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,51 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/replicate_kernels_cpu.h" + +namespace FlexFlow::Kernels::Replicate { + +template <DataType DT> +struct CPUForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + memcpy(output.get<DT>
(), + input.get<DT>
(), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative()); + } +}; + +template <DataType DT> +struct CPUBackwardKernel { + void operator()(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + nonnegative_int num_elements, + nonnegative_int num_replicas) { + using T = real_type_t<DT>
; + for (nonnegative_int i : nonnegative_range(num_elements)) { + T cur_sum = 0; + for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { + cur_sum += output.at<DT>
(LegionOrdered{replica_idx, i}); + } + input.at<DT>
(LegionOrdered{i}) = cur_sum; + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output) { + DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW &input, + size_t num_replicas) { + nonnegative_int num_elements = input.shape.num_elements(); + DataTypeDispatch1<CPUBackwardKernel>{}(input.data_type, + output, + input, + num_elements, + nonnegative_int{num_replicas}); +} + +} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..4d9eb8cc09 --- /dev/null +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,46 @@ +#include "kernels/datatype_dispatch.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +namespace FlexFlow::Kernels::Reverse { + +template <DataType DT> +struct CPUReverseForwardKernel { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + ReverseAttrs const &attrs) { + nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + + for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + nonnegative_int input_reverse_axis_coord = + input_coord.ff_ordered.at(attrs.axis); + + ArrayCoord output_coord = input_coord; + output_coord.ff_ordered.at(attrs.axis) = + nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + input_reverse_axis_coord.unwrap_nonnegative() - 1}; + + output.at<DT>
(output_coord.ff_ordered) = + input.at<DT>
(input_coord.ff_ordered); + } + } +}; + +void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { + + DataTypeDispatch1{}( + input_accessor.data_type, input_accessor, output_accessor, attrs); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { + DataTypeDispatch1{}(output_grad_accessor.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); +} + +} // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 66388c0ec8..86b2d8a437 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "utils/containers/reversed.h" @@ -29,13 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } @@ -49,7 +49,7 @@ __global__ void assign_kernel(DT *ptr, size_t size, DT value) { } template -__global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { +__global__ void copy_kernel(DT *dst, const DT *src, size_t size) { CUDA_KERNEL_LOOP(i, size) { dst[i] = src[i]; } @@ -281,11 +281,11 @@ template __global__ void add_kernel(bool *dst, bool const *src, unsigned long size); template __global__ void - copy_kernel(float *dst, float const *src, coord_t size); + copy_kernel(float *dst, float const *src, size_t size); template __global__ void - copy_kernel(int32_t *dst, int32_t const *src, coord_t size); + copy_kernel(int32_t *dst, int32_t const *src, size_t size); template __global__ void - copy_kernel(int64_t *dst, int64_t const *src, coord_t size); + copy_kernel(int64_t *dst, int64_t const *src, size_t size); template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index e6a614ba70..cb84f0e777 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -13,16 +13,15 @@ * limitations under the License. 
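Note on the coord_t-to-size_t migration in cuda_helper.cu above: these helpers are all built on the grid-stride-loop idiom, so widening the index type only changes the loop bound, not the traversal. A minimal sketch of the idiom, assuming a CUDA_KERNEL_LOOP macro of the usual shape (the real definition lives in internal/device.h and may differ in detail):

// Hypothetical expansion of CUDA_KERNEL_LOOP; sketch only.
#define CUDA_KERNEL_LOOP(i, n)                                      \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += (size_t)blockDim.x * gridDim.x)

// scale_kernel from above under that expansion: every thread strides
// through the buffer applying the affine map x -> a + (b - a) * x.
__global__ void scale_kernel_sketch(float *ptr, size_t size, float a, float b) {
  CUDA_KERNEL_LOOP(i, size) {
    ptr[i] = (b - a) * ptr[i] + a;
  }
}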
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/embedding_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Embedding { +namespace FlexFlow::Kernels::Embedding { void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( @@ -31,36 +30,14 @@ void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p) { cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); // Randomly initialize the intput tensor to avoid out of index range issues rand_generate_int<<>>( ptr, size, p); } -template -__global__ void embed_forward_no_aggr( - TI const *input, TD *output, TD const *embed, int out_dim, int batch_size); -template -__global__ void embed_forward_with_aggr(TI const *input, - TD *output, - TD const *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); -template -__global__ void embed_backward_no_aggr( - TI const *input, TD const *output, TD *embed, int out_dim, int batch_size); -template -__global__ void embed_backward_with_aggr(TI const *input, - TD const *output, - TD *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr); - -template +template __global__ void embed_forward_no_aggr(int32_t const *input, TD *output, TD const *embed, @@ -75,7 +52,7 @@ __global__ void embed_forward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_no_aggr(int64_t const *input, TD *output, TD const *embed, @@ -90,14 +67,14 @@ __global__ void embed_forward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int32_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -115,14 +92,14 @@ __global__ void embed_forward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_forward_with_aggr(int64_t const *input, TD *output, TD const *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { output[i] = 0; @@ -140,7 +117,7 @@ __global__ void embed_forward_with_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int32_t const *input, TD const *output, TD *embed, @@ -154,7 +131,7 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_no_aggr(int64_t const *input, TD const *output, TD *embed, @@ -171,11 +148,11 @@ __global__ void embed_backward_no_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void embed_backward_no_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int batch_size) { +__global__ void embed_backward_no_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -192,11 +169,11 @@ __global__ void embed_backward_no_aggr(int32_t const *input, } template <> -__global__ void embed_backward_no_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int 
batch_size) { +__global__ void embed_backward_no_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int batch_size) { CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; int off = i % out_dim; @@ -212,14 +189,14 @@ __global__ void embed_backward_no_aggr(int64_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int32_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -238,14 +215,14 @@ __global__ void embed_backward_with_aggr(int32_t const *input, } } -template +template __global__ void embed_backward_with_aggr(int64_t const *input, TD const *output, TD *embed, int out_dim, int in_dim, int batch_size, - std::optional aggr) { + AggregateOp aggr) { TD scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -267,14 +244,13 @@ __global__ void embed_backward_with_aggr(int64_t const *input, // Specialization for half type template <> -__global__ void - embed_backward_with_aggr(int32_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int32_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -301,14 +277,13 @@ __global__ void } template <> -__global__ void - embed_backward_with_aggr(int64_t const *input, - half const *output, - half *embed, - int out_dim, - int in_dim, - int batch_size, - std::optional aggr) { +__global__ void embed_backward_with_aggr(int64_t const *input, + half const *output, + half *embed, + int out_dim, + int in_dim, + int batch_size, + AggregateOp aggr) { half scale = 1.0f / in_dim; CUDA_KERNEL_LOOP(i, batch_size * out_dim) { int idx = i / out_dim; @@ -351,35 +326,229 @@ struct ForwardKernel { int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(weight.data_type == DataType::HALF || - weight.data_type == DataType::FLOAT || - weight.data_type == DataType::DOUBLE); + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { if (!aggr.has_value()) { - embed_forward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, 
batch_size); } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); - embed_forward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight.get(), + stream>>>(input.get(), + output.get(), + weight.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct ForwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_forward_no_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + batch_size); + } else { + assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); + embed_forward_with_aggr + <<>>(input.get(), + output.get(), + weight.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); } } }; @@ -388,39 +557,229 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + throw mk_runtime_error(fmt::format( + "Invalid type combination: input type {} and output type {}", TI, TD)); + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + 
out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + int in_dim, + int out_dim, + int batch_size) { + if (!aggr.has_value()) { + embed_backward_no_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + batch_size); + } else { + embed_backward_with_aggr + <<>>(input.get(), + output.get(), + weight_grad.get(), + out_dim, + in_dim, + batch_size, + aggr.value()); + } + } +}; + +template <> +struct BackwardKernel { + void operator()(cudaStream_t stream, + std::optional aggr, GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, int in_dim, int out_dim, int batch_size) { - assert(input.data_type == DataType::INT32 || - input.data_type == DataType::INT64); - assert(output.data_type == DataType::HALF || - output.data_type == DataType::FLOAT || - output.data_type == DataType::DOUBLE); if (!aggr.has_value()) { - embed_backward_no_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, batch_size); } else { - embed_backward_with_aggr, real_type_t> - << + <<>>(input.get(), - output.get(), - weight_grad.get(), + stream>>>(input.get(), + output.get(), + weight_grad.get(), out_dim, in_dim, batch_size, - aggr); + aggr.value()); } } }; @@ -448,27 +807,25 @@ void forward_kernel(ffStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, GenericTensorAccessorR const 
&output, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, - DataType input_data_type, DataType output_data_type, + DataType input_data_type, std::optional aggr, int in_dim, int out_dim, int batch_size) { - DataTypeDispatch2{}(input_data_type, - output_data_type, + DataTypeDispatch2{}(output_data_type, + input_data_type, stream, aggr, - input, output, + input, weight_grad, in_dim, out_dim, batch_size); } -} // namespace Embedding -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Embedding diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 6c22efda21..2fccf4b48f 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/loss_function_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/metrics_functions.cu b/lib/kernels/src/cuda/metrics_functions.cu index 2e037eb472..54ecd076f6 100644 --- a/lib/kernels/src/cuda/metrics_functions.cu +++ b/lib/kernels/src/cuda/metrics_functions.cu @@ -13,17 +13,42 @@ * limitations under the License. */ -#include "flexflow/model.h" -#include "flexflow/utils/cuda_helper.h" +#include "internal/device.h" +#include "kernels/metrics_kernels.h" +#include "kernels/perf_metrics.h" +#include "pcg/metric_attrs.h" namespace FlexFlow { +struct CUDAPerfMetrics { + int train_all; + int train_correct; + float cce_loss; + float sparse_cce_loss; + float mse_loss; + float rmse_loss; + float mae_loss; + double start_time; + double current_time; + + CUDAPerfMetrics() = delete; + CUDAPerfMetrics(PerfMetrics const &perf) + : train_all(perf.train_all), + train_correct(perf.train_correct.value_or(-1)), + cce_loss(perf.cce_loss.value_or(-1)), + sparse_cce_loss(perf.sparse_cce_loss.value_or(-1)), + mse_loss(perf.mse_loss.value_or(-1)), + rmse_loss(perf.rmse_loss.value_or(-1)), + mae_loss(perf.mae_loss.value_or(-1)), start_time(perf.start_time), + current_time(perf.current_time) {} +}; + float const LOG_MIN_VALUE = 0.00000001f; __global__ void update_metrics_sparse_label_kernel(float const *logits, int const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -72,8 +97,8 @@ __global__ void update_metrics_sparse_label_kernel(float const *logits, __global__ void update_metrics_label_kernel(float const *logits, float const *labels, - PerfMetrics *perf, - const Metrics metrics, + CUDAPerfMetrics *perf, + const MetricsAttrs metrics, int num_samples, int num_classes) { CUDA_KERNEL_LOOP(b, num_samples) { @@ -136,17 +161,17 @@ __global__ void update_metrics_label_kernel(float const *logits, } } -void Metrics::update_metrics_sparse_label_kernel_wrapper( - float const *logit_ptr, - int const *label_ptr, - Metrics const *me, - int num_effective_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_sparse_label_kernel_wrapper(float const *logit_ptr, + int const *label_ptr, + MetricsAttrs const &me, + int num_effective_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, 
sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -154,32 +179,33 @@ void Metrics::update_metrics_sparse_label_kernel_wrapper( CUDA_NUM_THREADS, 0, stream>>>( - logit_ptr, label_ptr, perf, *me, num_effective_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_effective_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } -void Metrics::update_metrics_label_kernel_wrapper(float const *logit_ptr, - float const *label_ptr, - Metrics const *me, - int num_samples, - int num_classes, - PerfMetrics &perf_zc) { - PerfMetrics *perf; - checkCUDA(cudaMalloc(&perf, sizeof(PerfMetrics))); - checkCUDA( - cudaMemcpy(perf, &perf_zc, sizeof(PerfMetrics), cudaMemcpyHostToDevice)); +void update_metrics_label_kernel_wrapper(float const *logit_ptr, + float const *label_ptr, + MetricsAttrs const &me, + int num_samples, + int num_classes, + PerfMetrics &perf_zc) { + CUDAPerfMetrics perf(perf_zc); + CUDAPerfMetrics *perf_cuda; + checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics))); + checkCUDA(cudaMemcpy( + perf_cuda, &perf, sizeof(CUDAPerfMetrics), cudaMemcpyHostToDevice)); cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); update_metrics_label_kernel<<>>( - logit_ptr, label_ptr, perf, *me, num_samples, num_classes); + logit_ptr, label_ptr, perf_cuda, me, num_samples, num_classes); checkCUDA(cudaStreamSynchronize(stream)); - checkCUDA( - cudaMemcpy(&perf_zc, perf, sizeof(PerfMetrics), cudaMemcpyDeviceToHost)); - checkCUDA(cudaFree(perf)); + checkCUDA(cudaMemcpy( + &perf, perf_cuda, sizeof(CUDAPerfMetrics), cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(perf_cuda)); } }; // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/attention_kernels.cu b/lib/kernels/src/cuda/ops/attention_kernels.cu index 38c32ad9e4..e5bdb6f21d 100644 --- a/lib/kernels/src/cuda/ops/attention_kernels.cu +++ b/lib/kernels/src/cuda/ops/attention_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/attention_kernels.h" #include "kernels/device.h" diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index eb23514c5f..348eed9f0c 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/batch_matmul_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 4e153a028e..ceb3a1b3d9 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
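The rewritten metrics wrappers above both follow one pattern: convert PerfMetrics (with its optional fields) into the POD CUDAPerfMetrics, round-trip it through device memory, and let the kernel update it in place. The pattern in isolation, with names taken from the wrappers above:

// Host-to-device round-trip for the metrics struct.
CUDAPerfMetrics perf(perf_zc); // optional<float> fields collapse to floats
CUDAPerfMetrics *perf_cuda;
checkCUDA(cudaMalloc(&perf_cuda, sizeof(CUDAPerfMetrics)));
checkCUDA(cudaMemcpy(perf_cuda, &perf, sizeof(CUDAPerfMetrics),
                     cudaMemcpyHostToDevice));
// ... kernel launch that updates *perf_cuda ...
checkCUDA(cudaStreamSynchronize(stream));
checkCUDA(cudaMemcpy(&perf, perf_cuda, sizeof(CUDAPerfMetrics),
                     cudaMemcpyDeviceToHost));
checkCUDA(cudaFree(perf_cuda));

One caveat worth flagging: the copy-back lands in the local CUDAPerfMetrics, so unless the results are converted back into the PerfMetrics &perf_zc parameter, callers will not observe the update.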
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/batch_norm_kernels.h" #include "kernels/ff_handle.h" @@ -53,9 +53,9 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_grad_ptr, float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, float *input_grad_ptr, float const *scale_ptr, float *scale_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index fe7aec68b9..f3ea6db660 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/cast_kernels.h" #include "kernels/datatype_dispatch.h" @@ -50,30 +50,26 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + size_t volume = output.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( - input.get(), output.get(), volume, cast_to(1.0f)); + output.get(), input.get(), volume, cast_to(1.0f)); } }; void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + input.data_type, output.data_type, stream, input, output); } void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - DataType input_type, - DataType output_type) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - input_type, output_type, stream, input, output); + output.data_type, input.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 7cc67ceed8..08cc343fd2 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/combine_kernels.h" #include "kernels/datatype_dispatch.h" diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 2715ff16e9..37dbbe12f8 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -13,50 +13,58 @@ * limitations under the License. 
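cast_kernels.cu above drops the explicit DataType parameters: the element types now come from the accessors themselves, and backward takes (output, input) order like the other kernels in this change. A usage sketch, where run_cast is a hypothetical call site:

#include "kernels/cast_kernels.h"

// Hypothetical caller: no DataType arguments, the accessors carry them.
void run_cast(ffStream_t stream,
              FlexFlow::GenericTensorAccessorR const &input,    // e.g. FLOAT
              FlexFlow::GenericTensorAccessorW const &output) { // e.g. INT32
  FlexFlow::Kernels::Cast::forward_kernel(stream, input, output);
}
// Backward mirrors the new ordering, upstream gradient first:
//   Kernels::Cast::backward_kernel(stream, output_grad, input_grad);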
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/concat_kernels.h" #include -namespace FlexFlow { -namespace Kernels { -namespace Concat { +namespace FlexFlow::Kernels::Concat { void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); + assert(legion_axis.value < shape.num_dims()); + if (legion_axis.value == 0_n) { + legion_axis.value = 1_n; + } + blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() .unwrap_nonnegative(); - num_blocks = - shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); + num_blocks = shape.sub_shape(legion_axis, std::nullopt) + .num_elements() + .unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = inputs.size(); - assert(num_inputs <= MAX_NUM_INPUTS); + assert(inputs.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); - for (int i = 0; i < num_inputs; i++) { - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], inputs[i].shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - copy_with_stride<<>>(output.get_float_ptr() + offset, - inputs[i].get_float_ptr(), - num_blocks, + input.get_float_ptr(), + blocks_to_copy, output_blk_size, - input_blk_sizes[i]); - offset += input_blk_sizes[i]; + input_blk_size); + + offset += (output_blk_size == input_blk_size) + ? input_blk_size * input_num_blocks + : input_blk_size; } } @@ -64,32 +72,32 @@ void backward_kernel(cudaStream_t stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis) { - size_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS]; - int num_inputs = input_grads.size(); - assert(num_inputs <= MAX_NUM_INPUTS); - + assert(input_grads.size() <= MAX_NUM_INPUTS); + size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); - for (int i = 0; i < num_inputs; i++) { - ArrayShape shape = input_grads[i].shape; - size_t input_num_blocks = 1; - calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis); - assert(input_num_blocks == num_blocks); - } - off_t offset = 0; - for (int i = 0; i < num_inputs; i++) { - add_with_stride<<>>(input_grads[i].get_float_ptr(), + stream>>>(input_grad.get_float_ptr(), output_grad.get_float_ptr() + offset, - num_blocks, - input_blk_sizes[i], + blocks_to_add, + input_blk_size, output_blk_size); - offset += input_blk_sizes[i]; + + offset += (output_blk_size == input_blk_size) + ? 
input_blk_size * input_num_blocks + : input_blk_size; } } -} // namespace Concat -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index dac55539d2..16db62a57f 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { @@ -313,10 +313,10 @@ void forward_kernel(ffStream_t stream, void backward_kernel(ffStream_t stream, Conv2DPerDeviceState const &m, - float const *input_ptr, - float *input_grad_ptr, float const *output_ptr, float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, float const *filter_ptr, float *filter_grad_ptr, float *bias_grad_ptr, diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index adf0cd8e89..c5fa56bc78 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/dropout_kernels.h" #include "kernels/ff_handle.h" diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 44273a323f..3a4a77b3dd 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/element_binary_kernels.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 056c80ecf6..218e74b939 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
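The reworked concat calc_blk_size above first maps the ff_dim_t axis into legion (innermost-first) order and then splits the shape there: dims below the axis form one contiguous block, dims at and above it count blocks. A standalone sketch of that split over plain extents (calc_blk_size_sketch is hypothetical; the real function works on ArrayShape):

#include <cstddef>
#include <vector>

// legion_dims is innermost-first; legion_axis is the axis as computed
// above, after its axis-0 special case.
void calc_blk_size_sketch(size_t &num_blocks, size_t &blk_size,
                          std::vector<size_t> const &legion_dims,
                          size_t legion_axis) {
  blk_size = 1;
  for (size_t d = 0; d < legion_axis; d++) {
    blk_size *= legion_dims[d]; // contiguous elements within one block
  }
  num_blocks = 1;
  for (size_t d = legion_axis; d < legion_dims.size(); d++) {
    num_blocks *= legion_dims[d]; // how many such blocks exist
  }
}

For legion_dims = {4, 3, 2} and legion_axis = 1 this yields blk_size = 4 and num_blocks = 6.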
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/get_op_type.h" @@ -290,10 +290,10 @@ struct BackwardKernel { OperatorType op_type, std::optional scalar, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); if (use_cudnn(op_type)) { @@ -356,20 +356,20 @@ void backward_kernel(ffStream_t stream, ElementUnaryPerDeviceState const &device_state, ElementUnaryAttrs const &attrs, PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { DataTypeDispatch1{}(input.data_type, stream, device_state, get_op_type(attrs), attrs.scalar, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } } // namespace ElementUnary diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 973d05f596..594a183ff0 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/flat_kernels.h" @@ -35,8 +35,8 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, GenericTensorAccessorR input, - float *input_grad_ptr, - float const *output_grad_ptr) { + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 31c1bac217..19e495a540 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -13,14 +13,12 @@ * limitations under the License. 
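The element-unary and flat reorderings above follow the same convention applied throughout this patch: in backward signatures, each upstream (output) gradient is passed before the input gradient it produces, and forward values precede their gradients. Schematically, and only as a convention sketch rather than any single kernel's signature:

// Convention adopted across backward kernels in this change:
//   backward_kernel(stream, [state,] output..., output_grad...,
//                   input..., input_grad..., trailing sizes)
void backward_kernel(cudaStream_t stream,
                     float const *output_grad_ptr, // upstream gradient
                     float *input_grad_ptr,        // accumulated in place
                     size_t num_elements);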
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" #include "kernels/gather_kernels.h" -namespace FlexFlow { -namespace Kernels { -namespace Gather { +namespace FlexFlow::Kernels::Gather { template __global__ void gather_forward(float const *input, @@ -125,11 +123,15 @@ void forward_kernel(ffStream_t stream, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) + output.shape + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); @@ -157,9 +159,13 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape - .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume() + .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) + .num_elements() .unwrap_nonnegative(); + if (m.legion_dim.value == 0_n) { + stride = 1; + } + coord_t output_dim_size = output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); coord_t input_dim_size = @@ -180,6 +186,4 @@ void backward_kernel(ffStream_t stream, output_dim_size); } -} // namespace Gather -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index ca51f0d216..02bda55828 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
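The gather stride above mirrors the old computation while making the legion-dim arithmetic explicit: the stride is taken as the volume of the sub-shape up to and including the gathered dim, with an explicit stride = 1 override when that dim is the innermost. A standalone mirror over plain extents (gather_stride_sketch is hypothetical):

#include <cstddef>
#include <vector>

// legion_dims is innermost-first; mirrors the computation in
// forward_kernel/backward_kernel above, including the override.
size_t gather_stride_sketch(std::vector<size_t> const &legion_dims,
                            size_t legion_dim) {
  if (legion_dim == 0) {
    return 1;
  }
  size_t stride = 1;
  for (size_t d = 0; d < legion_dim + 1; d++) { // sub_shape(0, legion_dim + 1)
    stride *= legion_dims[d];
  }
  return stride;
}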
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/linear_kernels.h" #include "utils/integer_conversions.h" @@ -108,10 +108,10 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, void forward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *output_ptr, - void const *weight_ptr, - void const *bias_ptr, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, int in_dim, int out_dim, int batch_size) { @@ -135,14 +135,14 @@ void forward_kernel(cudaStream_t stream, batch_size, in_dim, &alpha, - weight_ptr, + static_cast(weight_ptr), weight_type, in_dim, - input_ptr, + static_cast(input_ptr), input_type, in_dim, &beta, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -156,14 +156,14 @@ void forward_kernel(cudaStream_t stream, batch_size, 1, &alpha, - bias_ptr, + static_cast(bias_ptr), weight_type, 1, - m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, &alpha, - output_ptr, + static_cast(output_ptr), output_type, out_dim, compute_type, @@ -174,10 +174,10 @@ void forward_kernel(cudaStream_t stream, m.actiDesc, &alpha, m.outputTensor, - output_ptr, + static_cast(output_ptr), &beta, m.outputTensor, - output_ptr)); + static_cast(output_ptr))); } else if (m.activation == Activation::GELU) { size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) @@ -191,13 +191,13 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, LinearPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, - void const *output_ptr, - void *output_grad_ptr, - void const *kernel_ptr, - void *kernel_grad_ptr, - void *bias_grad_ptr, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, int in_dim, int out_dim, int batch_size) { @@ -216,11 +216,17 @@ void backward_kernel(cudaStream_t stream, int output_size = out_dim * batch_size; if (m.activation.has_value()) { if (m.activation == Activation::RELU) { - relu_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + relu_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else if (m.activation == Activation::SIGMOID) { - sigmoid_backward_kernel( - m.output_type, output_grad_ptr, output_ptr, output_size, stream); + sigmoid_backward_kernel(m.output_type, + static_cast(output_grad_ptr), + static_cast(output_ptr), + output_size, + stream); } else { // TODO: only support relu and sigmoid for now assert(false && "Unsupported activation for Linear"); @@ -235,14 +241,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - input_ptr, + static_cast(input_ptr), input_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - kernel_grad_ptr, + static_cast(kernel_grad_ptr), weight_type, in_dim, compute_type, @@ -261,12 +267,12 @@ void backward_kernel(cudaStream_t stream, in_dim, out_dim, &alpha, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim, &lambda, - (float *)kernel_ptr, + kernel_ptr, in_dim, - (float *)kernel_grad_ptr, + kernel_grad_ptr, in_dim)); } else { assert(false && "Only L2 regularization is supported"); @@ -284,14 +290,14 @@ void backward_kernel(cudaStream_t stream, out_dim, batch_size, &alpha, - 
m.one_ptr, + static_cast(m.one_ptr), CUDA_R_32F, 1, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - bias_grad_ptr, + static_cast(bias_grad_ptr), weight_type, 1, compute_type, @@ -307,14 +313,14 @@ void backward_kernel(cudaStream_t stream, batch_size, out_dim, &alpha, - kernel_ptr, + static_cast(kernel_ptr), weight_type, in_dim, - output_grad_ptr, + static_cast(output_grad_ptr), output_type, out_dim, &alpha, - input_grad_ptr, + static_cast(input_grad_ptr), input_type, in_dim, compute_type, diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 2831562f58..b8dfac5204 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/partition_kernels.h" @@ -40,8 +40,8 @@ template struct BackwardKernel { void operator()(cudaStream_t stream, RepartitionPerDeviceState const &m, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &output_grad) { + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { add_kernel> <<{}( - m.data_type, stream, m, input_grad, output_grad); + m.data_type, stream, m, output_grad, input_grad); } } // namespace Repartition diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index 51fa29d289..e8ea3f64c2 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/pool_2d_kernels.h" namespace FlexFlow { @@ -112,10 +112,10 @@ void forward_kernel(cudaStream_t stream, void backward_kernel(cudaStream_t stream, Pool2DPerDeviceState const &m, - void const *input_ptr, - void *input_grad_ptr, void const *output_ptr, - void const *output_grad_ptr) { + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 02a89da807..563bbae21d 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reduce_kernels.h" namespace FlexFlow { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 5d95a3766a..d9c09b082d 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
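The linear-kernel change above narrows the public signatures from void * to float * and pushes type erasure down to the cuBLAS boundary, which is genuinely untyped: cublasGemmEx takes void const * buffers plus a cudaDataType tag per matrix. The casts therefore sit exactly at the call. The shape of the call in forward_kernel above, with m, alpha, beta, and the *_type locals from that context (error checking elided):

// Typed float buffers, cast to void only where cuBLAS requires it.
cublasGemmEx(m.handle.blas, CUBLAS_OP_T, CUBLAS_OP_N,
             out_dim, batch_size, in_dim,
             &alpha,
             static_cast<void const *>(weight_ptr), weight_type, in_dim,
             static_cast<void const *>(input_ptr), input_type, in_dim,
             &beta,
             static_cast<void *>(output_ptr), output_type, out_dim,
             compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP);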
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reduction_kernels.h" @@ -55,8 +55,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -75,9 +75,9 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(output.data_type, stream, output, input); } } // namespace Reduction diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4706f38fd4..4685fd7a2d 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels.h" @@ -22,8 +22,8 @@ namespace Kernels { namespace Replicate { template -__global__ void replicate_backward_kernel(T *input_ptr, - T const *output_ptr, +__global__ void replicate_backward_kernel(T const *output_ptr, + T *input_ptr, size_t num_elements, size_t num_replicas) { CUDA_KERNEL_LOOP(i, num_elements) { @@ -38,7 +38,6 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), input.shape.num_elements().unwrap_nonnegative() * @@ -51,15 +50,15 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( - input.get(), output.get(), + input.get(), input.shape.num_elements().unwrap_nonnegative(), num_replicas); } @@ -72,11 +71,11 @@ void forward_kernel(cudaStream_t stream, } void backward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input, size_t num_replicas) { DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); + input.data_type, stream, output, input, num_replicas); } } // namespace Replicate diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index c5a289ce6b..a6a390b38e 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -13,7 +13,7 @@ * limitations under the License. 
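Replicate's backward above reduces across replicas into a single input gradient; only the pointer order changed, not the reduction. For reference, a CPU-equivalent of replicate_backward_kernel, assuming for the sketch a replica-major layout of the copies (the real kernels derive the indexing from the accessor shape):

#include <cstddef>

// Each input element accumulates its num_replicas copies from the output.
void replicate_backward_reference(float const *output_ptr, float *input_ptr,
                                  size_t num_elements, size_t num_replicas) {
  for (size_t i = 0; i < num_elements; i++) {
    for (size_t r = 0; r < num_replicas; r++) {
      input_ptr[i] += output_ptr[i + r * num_elements];
    }
  }
}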
*/ -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/reshape_kernels.h" @@ -43,8 +43,8 @@ struct ForwardKernel { template struct BackwardKernel { void operator()(cudaStream_t stream, - GenericTensorAccessorW const &input, - GenericTensorAccessorR const &output) { + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> <<{}(m.data_type, stream, input, output); + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch1{}(m.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 8391a499df..582aa02386 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -13,13 +13,11 @@ * limitations under the License. */ -#include "device.h" +#include "internal/device.h" #include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_params.h" -namespace FlexFlow { - -namespace Kernels { -namespace Reverse { +namespace FlexFlow::Kernels::Reverse { __global__ void reverse_forward_kernel(float const *in_ptr, float *out_ptr, @@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr, coord_t reverse_dim_size, coord_t in_blk_size) { CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) { + coord_t out_idx = i; coord_t blk_idx = i / (reverse_dim_size * in_blk_size); i = i - blk_idx * (reverse_dim_size * in_blk_size); coord_t reverse_dim_idx = i / in_blk_size; i = i - reverse_dim_idx * in_blk_size; coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) + (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i; - out_ptr[i] = in_ptr[in_idx]; + out_ptr[out_idx] = in_ptr[in_idx]; } } -void forward_kernel(cudaStream_t stream, - float const *in_ptr, - float *out_ptr, - coord_t num_out_blks, - coord_t reverse_dim_size, - coord_t in_blk_size, - coord_t output_size) { +static void forward_kernel_internal(cudaStream_t stream, + float const *in_ptr, + float *out_ptr, + coord_t num_out_blks, + coord_t reverse_dim_size, + coord_t in_blk_size, + coord_t output_size) { reverse_forward_kernel<< 0.0f) { - V[i] = V[i] * momentum + gt; - if (nesterov) { - gt = gt + momentum * V[i]; - } else { - gt = V[i]; - } - } - W[i] -= lr * gt; - } -} - -__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - // Step 1: Gather gradients in the first replica - for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; - apply_add_with_scale - <<>>( - (float *)w_grad_ptr, src, size, 1.0f); - } - // checkCUDA(cudaDeviceSynchronize()); - // Step 2: SGD update - sgd_update<<>>( - size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); - // checkCUDA(cudaDeviceSynchronize()); -} - -#ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { - // Use NCCL to sync gradients - // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, - size, - ncclFloat, - 
diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu
index 8391a499df..582aa02386 100644
--- a/lib/kernels/src/cuda/ops/reverse_kernels.cu
+++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu
@@ -13,13 +13,11 @@
  * limitations under the License.
  */
-#include "device.h"
+#include "internal/device.h"
 #include "kernels/reverse_kernels.h"
+#include "kernels/reverse_kernels_params.h"
-namespace FlexFlow {
-
-namespace Kernels {
-namespace Reverse {
+namespace FlexFlow::Kernels::Reverse {
 __global__ void reverse_forward_kernel(float const *in_ptr,
                                        float *out_ptr,
@@ -27,23 +25,24 @@ __global__ void reverse_forward_kernel(float const *in_ptr,
                                        coord_t reverse_dim_size,
                                        coord_t in_blk_size) {
   CUDA_KERNEL_LOOP(i, num_out_blks * reverse_dim_size * in_blk_size) {
+    coord_t out_idx = i;
     coord_t blk_idx = i / (reverse_dim_size * in_blk_size);
     i = i - blk_idx * (reverse_dim_size * in_blk_size);
     coord_t reverse_dim_idx = i / in_blk_size;
     i = i - reverse_dim_idx * in_blk_size;
     coord_t in_idx = blk_idx * (reverse_dim_size * in_blk_size) +
                      (reverse_dim_size - 1 - reverse_dim_idx) * in_blk_size + i;
-    out_ptr[i] = in_ptr[in_idx];
+    out_ptr[out_idx] = in_ptr[in_idx];
   }
 }
-void forward_kernel(cudaStream_t stream,
-                    float const *in_ptr,
-                    float *out_ptr,
-                    coord_t num_out_blks,
-                    coord_t reverse_dim_size,
-                    coord_t in_blk_size,
-                    coord_t output_size) {
+static void forward_kernel_internal(cudaStream_t stream,
+                                    float const *in_ptr,
+                                    float *out_ptr,
+                                    coord_t num_out_blks,
+                                    coord_t reverse_dim_size,
+                                    coord_t in_blk_size,
+                                    coord_t output_size) {
  reverse_forward_kernel<< 0.0f) {
-      V[i] = V[i] * momentum + gt;
-      if (nesterov) {
-        gt = gt + momentum * V[i];
-      } else {
-        gt = V[i];
-      }
-    }
-    W[i] -= lr * gt;
-  }
-}
-
-__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
-                                               float const *w_grad_ptr,
-                                               size_t size,
-                                               int num_replicas,
-                                               float *w_ptr,
-                                               float *v_ptr) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  // Step 1: Gather gradients in the first replica
-  for (int i = 1; i < num_replicas; i++) {
-    float const *src = w_grad_ptr + i * size;
-    apply_add_with_scale
-        <<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-            (float *)w_grad_ptr, src, size, 1.0f);
-  }
-  // checkCUDA(cudaDeviceSynchronize());
-  // Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-
-#ifdef FF_USE_NCCL
-__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
-                                                 PerDeviceOpState const *meta,
-                                                 float const *w_grad_ptr,
-                                                 size_t size,
-                                                 float *w_ptr,
-                                                 float *v_ptr) {
-  // Use NCCL to sync gradients
-  // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
-                          size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
-                          stream));
-  // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
-  // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]");
-
-  // Step 2: SGD update
-  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->lr,
-      op->weight_decay,
-      op->momentum,
-      op->nesterov,
-      w_grad_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-#endif
-
-// ==================================================================
-// Adam Optimizer
-// ==================================================================
-__global__ void
-    add_kernel(int count, float scale, float const *src, float *dst) {
-  CUDA_KERNEL_LOOP(i, count) {
-    dst[i] += src[i] * scale;
-  }
-}
-
-__global__ void scale_kernel(int count, float a, float b, float *ptr) {
-  CUDA_KERNEL_LOOP(i, count) {
-    ptr[i] = (b - a) * ptr[i] + a;
-  }
-}
-
-__global__ void adam_update(int count,
-                            float alpha_t,
-                            float beta1,
-                            float beta2,
-                            float weight_decay,
-                            float epsilon,
-                            float const *WGrad,
-                            float *M,
-                            float *V,
-                            float *W) {
-  // Reference for weight decay
-  // https://www.fast.ai/2018/07/02/adam-weight-decay/
-  CUDA_KERNEL_LOOP(i, count) {
-    // W[i] -= weight_decay * alpha_t * W[i];
-    // float gt = WGrad[i];
-    float gt = WGrad[i] + weight_decay * W[i];
-    float mt = beta1 * M[i] + (1 - beta1) * gt;
-    float vt = beta2 * V[i] + (1 - beta2) * gt * gt;
-    M[i] = mt;
-    V[i] = vt;
-    W[i] -= alpha_t * mt / (sqrt(vt) + epsilon);
-  }
-}
-
-__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
-                                                float const *w_grad_ptr,
-                                                size_t size,
-                                                int num_replicas,
-                                                float *w_ptr,
-                                                float *v_ptr,
-                                                float *m_ptr) {
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  // Step 1: Gather gradients in the first replica
-  for (int i = 1; i < num_replicas; i++) {
-    float const *src = w_grad_ptr + i * size;
-    add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-        size, 1.0f, src, (float *)w_grad_ptr);
-  }
-  // checkCUDA(cudaDeviceSynchronize());
-  // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
-  //         op->alpha, op->alpha_t, op->weight_decay);
-  // Step 2: Adam update
-  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-
-#ifdef FF_USE_NCCL
-__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
-                                                  PerDeviceOpState const *meta,
-                                                  float const *w_grad_ptr,
-                                                  size_t size,
-                                                  float *w_ptr,
-                                                  float *v_ptr,
-                                                  float *m_ptr) {
-  // Use NCCL to sync gradients
-  cudaStream_t stream;
-  checkCUDA(get_legion_stream(&stream));
-  checkNCCL(ncclAllReduce(w_grad_ptr,
-                          (float *)w_grad_ptr,
-                          size,
-                          ncclFloat,
-                          ncclSum,
-                          meta->handle.ncclComm,
-                          stream));
-  // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
-  //         op->alpha, op->alpha_t, op->weight_decay);
-  // Step 2: Adam update
-  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
-      size,
-      op->alpha_t,
-      op->beta1,
-      op->beta2,
-      op->weight_decay,
-      op->epsilon,
-      w_grad_ptr,
-      m_ptr,
-      v_ptr,
-      w_ptr);
-  // checkCUDA(cudaDeviceSynchronize());
-}
-#endif
-
-} // namespace FlexFlow
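Note the API change here: the deleted file exposed the update steps as static members of SGDOptimizer/AdamOptimizer that read hyperparameters from op-> and fetched the stream from Legion, while the replacement below takes the stream, the hyperparameters, and (for the NCCL variants) the PerDeviceFFHandle as explicit arguments. A hypothetical call site for the new parameter-server SGD entry point (the surrounding variable names are illustrative, the signature is from the new file below):

// sketch only: weight_ptr, weight_grad_ptr, and sgd_v_ptr come from the
// caller's training state
sgd_ps_update_task_gpu(stream,
                       /*lr=*/0.01f,
                       /*momentum=*/0.9f,
                       /*nesterov=*/false,
                       /*weight_decay=*/1e-4f,
                       weight_grad_ptr,
                       size,
                       num_replicas,
                       weight_ptr,
                       sgd_v_ptr);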
diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu
new file mode 100644
index 0000000000..fe817876ce
--- /dev/null
+++ b/lib/kernels/src/cuda/optimizer_kernels.cu
@@ -0,0 +1,205 @@
+/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "internal/device.h"
+#include "kernels/nccl.h"
+#include "kernels/optimizer_kernels.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+__global__ void sgd_update(size_t count,
+                           float lr,
+                           float weight_decay,
+                           float momentum,
+                           bool nesterov,
+                           float const *WGrad,
+                           float *V,
+                           float *W) {
+  // Reference: https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD
+  CUDA_KERNEL_LOOP(i, count) {
+    float gt = WGrad[i] + weight_decay * W[i];
+    if (momentum > 0.0f) {
+      V[i] = V[i] * momentum + gt;
+      if (nesterov) {
+        gt = gt + momentum * V[i];
+      } else {
+        gt = V[i];
+      }
+    }
+    W[i] -= lr * gt;
+  }
+}
+
+__host__ void sgd_ps_update_task_gpu(ffStream_t stream,
+                                     float lr,
+                                     float momentum,
+                                     bool nesterov,
+                                     float weight_decay,
+                                     float const *weight_grad_ptr,
+                                     size_t size,
+                                     int num_replicas,
+                                     float *weight_ptr,
+                                     float *sgd_v_ptr) {
+  // Step 1: Gather gradients in the first replica
+  for (int i = 1; i < num_replicas; i++) {
+    float const *src = weight_grad_ptr + i * size;
+    apply_add_with_scale
+        <<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
+            (float *)weight_grad_ptr, src, size, 1.0f);
+  }
+
+  // Step 2: SGD update
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                lr,
+                                                                weight_decay,
+                                                                momentum,
+                                                                nesterov,
+                                                                weight_grad_ptr,
+                                                                sgd_v_ptr,
+                                                                weight_ptr);
+}
+
+#ifdef FF_USE_NCCL
+__host__ void sgd_nccl_update_task_gpu(ffStream_t stream,
+                                       float lr,
+                                       float momentum,
+                                       bool nesterov,
+                                       float weight_decay,
+                                       PerDeviceFFHandle const &handle,
+                                       float const *w_grad_ptr,
+                                       size_t size,
+                                       float *w_ptr,
+                                       float *v_ptr) {
+  // Step 1: Use NCCL to sync gradients
+  ncclComm_t comm = handle.ncclComm;
+  checkNCCL(ncclAllReduce(
+      w_grad_ptr, (float *)w_grad_ptr, size, ncclFloat, ncclSum, comm, stream));
+
+  // Step 2: SGD update
+  sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
+      size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr);
+}
+#endif
+
+// ==================================================================
+// Adam Optimizer
+// ==================================================================
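+// Note: the Adam entry points below take alpha_t rather than a raw learning
+// rate; the caller is assumed to fold in the standard per-step bias
+// correction, i.e. alpha_t = alpha * sqrt(1 - beta2^t) / (1 - beta1^t) at
+// step t (the weight-decay handling follows the fast.ai reference cited in
+// adam_update).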
+__global__ void
+    add_kernel(int count, float scale, float const *src, float *dst) {
+  CUDA_KERNEL_LOOP(i, count) {
+    dst[i] += src[i] * scale;
+  }
+}
+
+__global__ void scale_kernel(int count, float a, float b, float *ptr) {
+  CUDA_KERNEL_LOOP(i, count) {
+    ptr[i] = (b - a) * ptr[i] + a;
+  }
+}
+
+__global__ void adam_update(int count,
+                            float alpha_t,
+                            float beta1,
+                            float beta2,
+                            float weight_decay,
+                            float epsilon,
+                            float const *WGrad,
+                            float *M,
+                            float *V,
+                            float *W) {
+  // Reference for weight decay
+  // https://www.fast.ai/2018/07/02/adam-weight-decay/
+  CUDA_KERNEL_LOOP(i, count) {
+    // W[i] -= weight_decay * alpha_t * W[i];
+    // float gt = WGrad[i];
+    float gt = WGrad[i] + weight_decay * W[i];
+    float mt = beta1 * M[i] + (1 - beta1) * gt;
+    float vt = beta2 * V[i] + (1 - beta2) * gt * gt;
+    M[i] = mt;
+    V[i] = vt;
+    W[i] -= alpha_t * mt / (sqrt(vt) + epsilon);
+  }
+}
+
+__host__ void adam_ps_update_task_gpu(ffStream_t stream,
+                                      float alpha_t,
+                                      float beta1,
+                                      float beta2,
+                                      float weight_decay,
+                                      float epsilon,
+                                      float const *w_grad_ptr,
+                                      size_t size,
+                                      int num_replicas,
+                                      float *w_ptr,
+                                      float *v_ptr,
+                                      float *m_ptr) {
+  // Step 1: Gather gradients in the first replica
+  for (int i = 1; i < num_replicas; i++) {
+    float const *src = w_grad_ptr + i * size;
+    add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
+        size, 1.0f, src, (float *)w_grad_ptr);
+  }
+
+  // Step 2: Adam update
+  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                 alpha_t,
+                                                                 beta1,
+                                                                 beta2,
+                                                                 weight_decay,
+                                                                 epsilon,
+                                                                 w_grad_ptr,
+                                                                 m_ptr,
+                                                                 v_ptr,
+                                                                 w_ptr);
+}
+
+#ifdef FF_USE_NCCL
+__host__ void adam_nccl_update_task_gpu(ffStream_t stream,
+                                        float alpha_t,
+                                        float beta1,
+                                        float beta2,
+                                        float weight_decay,
+                                        float epsilon,
+                                        PerDeviceFFHandle const &handle,
+                                        float const *w_grad_ptr,
+                                        size_t size,
+                                        float *w_ptr,
+                                        float *v_ptr,
+                                        float *m_ptr) {
+  // Step 1: Use NCCL to sync gradients
+  checkNCCL(ncclAllReduce(w_grad_ptr,
+                          (float *)w_grad_ptr,
+                          size,
+                          ncclFloat,
+                          ncclSum,
+                          handle.ncclComm,
+                          stream));
+
+  // Step 2: Adam update
+  adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
+                                                                 alpha_t,
+                                                                 beta1,
+                                                                 beta2,
+                                                                 weight_decay,
+                                                                 epsilon,
+                                                                 w_grad_ptr,
+                                                                 m_ptr,
+                                                                 v_ptr,
+                                                                 w_ptr);
+}
+#endif
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/hip/embedding_kernels.cpp b/lib/kernels/src/hip/embedding_kernels.cpp
index 7ca3149f2f..aefe53cc46 100644
--- a/lib/kernels/src/hip/embedding_kernels.cpp
+++ b/lib/kernels/src/hip/embedding_kernels.cpp
@@ -14,7 +14,7 @@
  */
 #include "kernels/embedding_kernels.h"
-#include "device.h"
+#include "internal/device.h"
 #include "kernels/datatype_dispatch.h"
 #include <hip/hip_runtime.h>
@@ -364,8 +364,8 @@ struct ForwardKernel {
            weight.data_type == DataType::FLOAT ||
            weight.data_type == DataType::DOUBLE);
-    if (aggr == AggregateOp::NONE) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr<TI, TD>),
+    if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) {
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -374,10 +374,11 @@
+                         output.get<TD>(),
+                         weight.get<TD>(),
                          out_dim,
-                         batch_size);
+                         in_dim,
+                         batch_size,
+                         aggr);
     } else {
-      assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM);
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_with_aggr<TI, TD>),
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_forward_no_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -386,9 +387,7 @@
+                         output.get<TD>(),
+                         weight.get<TD>(),
                          out_dim,
-                         in_dim,
-                         batch_size,
-                         aggr);
+                         batch_size);
     }
   }
 }
@@ -408,8 +407,9 @@ struct BackwardKernel {
     assert(output.data_type == DataType::HALF ||
            output.data_type == DataType::FLOAT ||
            output.data_type == DataType::DOUBLE);
-    if (aggr == AggregateOp::NONE) {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr<TI, TD>),
+
+    if (aggr == AggregateOp::AVG || aggr == AggregateOp::SUM) {
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -418,9 +418,11 @@
+                         output.get<TD>(),
+                         weight_grad.get<TD>(),
                          out_dim,
-                         batch_size);
+                         in_dim,
+                         batch_size,
+                         aggr);
     } else {
-      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_with_aggr<TI, TD>),
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(embed_backward_no_aggr<TI, TD>),
                          GET_BLOCKS(output.shape.get_volume()),
                          CUDA_NUM_THREADS,
                          0,
@@ -429,9 +431,7 @@
+                         output.get<TD>(),
+                         weight_grad.get<TD>(),
                          out_dim,
-                         in_dim,
-                         batch_size,
-                         aggr);
+                         batch_size);
     }
   }
 }
diff --git a/lib/kernels/src/hip/loss_function_kernels.cpp b/lib/kernels/src/hip/loss_function_kernels.cpp
index e82b5c96d5..05068f1bd0 100644
--- a/lib/kernels/src/hip/loss_function_kernels.cpp
+++ b/lib/kernels/src/hip/loss_function_kernels.cpp
@@ -14,7 +14,7 @@
  */
 #include "kernels/loss_function_kernels.h"
-#include "device.h"
+#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/attention_kernels.cpp b/lib/kernels/src/hip/ops/attention_kernels.cpp index 005cef30d1..b374ead305 100644 --- a/lib/kernels/src/hip/ops/attention_kernels.cpp +++ b/lib/kernels/src/hip/ops/attention_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/attention_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp index c4b3be823f..6d9ae8a268 100644 --- a/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_matmul_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_matmul_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp index 8e94b462cd..764a3e0b58 100644 --- a/lib/kernels/src/hip/ops/batch_norm_kernels.cpp +++ b/lib/kernels/src/hip/ops/batch_norm_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/batch_norm_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/allocation.h" #include "kernels/ff_handle.h" #include diff --git a/lib/kernels/src/hip/ops/cast_kernels.cpp b/lib/kernels/src/hip/ops/cast_kernels.cpp index fa0c37ffa1..1035657c04 100644 --- a/lib/kernels/src/hip/ops/cast_kernels.cpp +++ b/lib/kernels/src/hip/ops/cast_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/cast_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/combine_kernels.cpp b/lib/kernels/src/hip/ops/combine_kernels.cpp index aa01f02276..f1e0422747 100644 --- a/lib/kernels/src/hip/ops/combine_kernels.cpp +++ b/lib/kernels/src/hip/ops/combine_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/combine_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp index aa38be739b..a215d67942 100644 --- a/lib/kernels/src/hip/ops/concat_kernels.cpp +++ b/lib/kernels/src/hip/ops/concat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/concat_kernels.h" -#include "device.h" +#include "internal/device.h" #include #include diff --git a/lib/kernels/src/hip/ops/conv_2d_kernels.h b/lib/kernels/src/hip/ops/conv_2d_kernels.h index bcf015d561..76a73ab08c 100644 --- a/lib/kernels/src/hip/ops/conv_2d_kernels.h +++ b/lib/kernels/src/hip/ops/conv_2d_kernels.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H #define _FLEXFLOW_KERNELS_HIP_CONV_2D_KERNELS_H -#include "device.h" +#include "kernels/device.h" namespace FlexFlow { namespace Kernels { diff --git a/lib/kernels/src/hip/ops/dropout_kernels.cpp b/lib/kernels/src/hip/ops/dropout_kernels.cpp index baaf8e6902..d85c0ae054 100644 --- a/lib/kernels/src/hip/ops/dropout_kernels.cpp +++ b/lib/kernels/src/hip/ops/dropout_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/dropout_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include diff --git a/lib/kernels/src/hip/ops/element_binary_kernels.cpp b/lib/kernels/src/hip/ops/element_binary_kernels.cpp index bc66bbff2f..9e0452b09b 100644 --- a/lib/kernels/src/hip/ops/element_binary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_binary_kernels.cpp @@ -14,7 
+14,7 @@ */ #include "kernels/element_binary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.dtg.h" diff --git a/lib/kernels/src/hip/ops/element_unary_kernels.cpp b/lib/kernels/src/hip/ops/element_unary_kernels.cpp index f4b0ccb82d..163f13a6da 100644 --- a/lib/kernels/src/hip/ops/element_unary_kernels.cpp +++ b/lib/kernels/src/hip/ops/element_unary_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/element_unary_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "op-attrs/get_op_type.h" #include diff --git a/lib/kernels/src/hip/ops/flat_kernels.cpp b/lib/kernels/src/hip/ops/flat_kernels.cpp index 763fb9e322..dedfb4b9a9 100644 --- a/lib/kernels/src/hip/ops/flat_kernels.cpp +++ b/lib/kernels/src/hip/ops/flat_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/flat_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include diff --git a/lib/kernels/src/hip/ops/gather_kernels.cpp b/lib/kernels/src/hip/ops/gather_kernels.cpp index 17c0014e98..6e9e4c6a2c 100644 --- a/lib/kernels/src/hip/ops/gather_kernels.cpp +++ b/lib/kernels/src/hip/ops/gather_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/gather_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/partition_kernels.cpp b/lib/kernels/src/hip/ops/partition_kernels.cpp index 4591247faa..26748a7e45 100644 --- a/lib/kernels/src/hip/ops/partition_kernels.cpp +++ b/lib/kernels/src/hip/ops/partition_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/partition_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp index ed942c105c..7e5ae2ab80 100644 --- a/lib/kernels/src/hip/ops/pool_2d_kernels.cpp +++ b/lib/kernels/src/hip/ops/pool_2d_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/pool_2d_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/reduce_kernels.cpp b/lib/kernels/src/hip/ops/reduce_kernels.cpp index 468543dd5b..c0bcc84d48 100644 --- a/lib/kernels/src/hip/ops/reduce_kernels.cpp +++ b/lib/kernels/src/hip/ops/reduce_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reduce_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/replicate_kernels.cpp b/lib/kernels/src/hip/ops/replicate_kernels.cpp index 8d27bb1908..ee7bf701c0 100644 --- a/lib/kernels/src/hip/ops/replicate_kernels.cpp +++ b/lib/kernels/src/hip/ops/replicate_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/replicate_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reshape_kernels.cpp b/lib/kernels/src/hip/ops/reshape_kernels.cpp index 47978a5f4a..810b929e24 100644 --- a/lib/kernels/src/hip/ops/reshape_kernels.cpp +++ b/lib/kernels/src/hip/ops/reshape_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reshape_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/datatype_dispatch.h" #include diff --git a/lib/kernels/src/hip/ops/reverse_kernels.cpp b/lib/kernels/src/hip/ops/reverse_kernels.cpp index 03e97245bf..a56ff3540a 100644 --- 
a/lib/kernels/src/hip/ops/reverse_kernels.cpp +++ b/lib/kernels/src/hip/ops/reverse_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/reverse_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/softmax_kernels.cpp b/lib/kernels/src/hip/ops/softmax_kernels.cpp index 3a8f2813b7..610675850b 100644 --- a/lib/kernels/src/hip/ops/softmax_kernels.cpp +++ b/lib/kernels/src/hip/ops/softmax_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/softmax_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/split_kernels.cpp b/lib/kernels/src/hip/ops/split_kernels.cpp index 5599ae6d6f..3034b633a6 100644 --- a/lib/kernels/src/hip/ops/split_kernels.cpp +++ b/lib/kernels/src/hip/ops/split_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/split_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/topk_kernels.cpp b/lib/kernels/src/hip/ops/topk_kernels.cpp index f085c5831f..777d9edffa 100644 --- a/lib/kernels/src/hip/ops/topk_kernels.cpp +++ b/lib/kernels/src/hip/ops/topk_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/topk_kernels.h" -#include "device.h" +#include "internal/device.h" #include namespace FlexFlow { diff --git a/lib/kernels/src/hip/ops/transpose_kernels.cpp b/lib/kernels/src/hip/ops/transpose_kernels.cpp index ef9dd58c63..c5122f34bf 100644 --- a/lib/kernels/src/hip/ops/transpose_kernels.cpp +++ b/lib/kernels/src/hip/ops/transpose_kernels.cpp @@ -14,7 +14,7 @@ */ #include "kernels/transpose_kernels.h" -#include "device.h" +#include "internal/device.h" #include "kernels/accessor.h" #include "utils/exception.h" #include diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/internal/device.cc similarity index 97% rename from lib/kernels/src/device.cc rename to lib/kernels/src/internal/device.cc index f46099c79a..eb3d229c2a 100644 --- a/lib/kernels/src/device.cc +++ b/lib/kernels/src/internal/device.cc @@ -1,4 +1,4 @@ -#include "device.h" +#include "internal/device.h" namespace FlexFlow { diff --git a/lib/kernels/src/device.h b/lib/kernels/src/internal/device.h similarity index 98% rename from lib/kernels/src/device.h rename to lib/kernels/src/internal/device.h index ceff2f92ff..226c7ad174 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_KERNELS_SRC_DEVICE_H -#define _FLEXFLOW_KERNELS_SRC_DEVICE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #include "kernels/array_shape.h" #include "kernels/device.h" diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc new file mode 100644 index 0000000000..b5042f77a0 --- /dev/null +++ b/lib/kernels/src/kernels/accessor.cc @@ -0,0 +1,249 @@ +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include + +namespace FlexFlow { + +nonnegative_int + calculate_accessor_offset(LegionOrdered const &indices, + ArrayShape const &shape) { + ASSERT(indices.size() == shape.num_dims(), + "Number of indices does not match the number of dimensions"); + + nonnegative_int offset = 0_n; + nonnegative_int multiplier = 1_n; + + for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { + 
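+    // multiplier accumulates a mixed-radix stride: the first dim visited
+    // gets stride 1, and each subsequent dim's stride is the product of the
+    // sizes of all dims visited before it. E.g. with sizes (a, b, c) visited
+    // in that order, offset = i + a * j + (a * b) * k for indices (i, j, k).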
ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}),
+           "Out of bounds access",
+           dim);
+
+    offset += indices.at(dim) * multiplier;
+    multiplier *= shape.at(legion_dim_t{dim});
+  }
+
+  return offset;
+}
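+
+// copy_accessor_data_to_l_from_r (below) picks the cudaMemcpyKind from the
+// two accessors' device types; a typical (illustrative) use is staging a GPU
+// tensor on the host for comparison:
+//   GenericTensorAccessorW host_copy = cpu_allocator.allocate_tensor(shape);
+//   copy_accessor_data_to_l_from_r(host_copy, gpu_accessor);  // GPU -> CPU
+// The sketch assumes dst and src already agree in shape and data type, which
+// the byte count computed from dst relies on.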
+
+void copy_accessor_data_to_l_from_r(
+    GenericTensorAccessorW &dst_accessor,
+    GenericTensorAccessorR const &src_accessor) {
+  size_t num_bytes =
+      dst_accessor.shape.get_volume().unwrap_nonnegative() *
+      size_of_datatype(dst_accessor.data_type).unwrap_nonnegative();
+
+  DeviceType dst_device_type = dst_accessor.device_type;
+  DeviceType src_device_type = src_accessor.device_type;
+
+  if (src_device_type == DeviceType::CPU &&
+      dst_device_type == DeviceType::CPU) {
+    memcpy(dst_accessor.ptr, src_accessor.ptr, num_bytes);
+  } else if (src_device_type == DeviceType::CPU &&
+             dst_device_type == DeviceType::GPU) {
+    checkCUDA(cudaMemcpy(
+        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyHostToDevice));
+  } else if (src_device_type == DeviceType::GPU &&
+             dst_device_type == DeviceType::CPU) {
+    checkCUDA(cudaMemcpy(
+        dst_accessor.ptr, src_accessor.ptr, num_bytes, cudaMemcpyDeviceToHost));
+  } else {
+    assert(src_device_type == DeviceType::GPU);
+    assert(dst_device_type == DeviceType::GPU);
+    checkCUDA(cudaMemcpy(dst_accessor.ptr,
+                         src_accessor.ptr,
+                         num_bytes,
+                         cudaMemcpyDeviceToDevice));
+  }
+}
+
+GenericTensorAccessorW::operator GenericTensorAccessorR() const {
+  return read_only_accessor_from_write_accessor(*this);
+}
+
+GenericTensorAccessorW::GenericTensorAccessorW(
+    DataType data_type,
+    ArrayShape const &shape,
+    void *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType const &, ArrayShape const &, void *const &, DeviceType const &>
+    GenericTensorAccessorW::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+bool GenericTensorAccessorW::operator==(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorW::operator!=(
+    GenericTensorAccessorW const &other) const {
+  return this->tie() != other.tie();
+}
+
+int32_t *GenericTensorAccessorW::get_int32_ptr() const {
+  return this->get<DataType::INT32>();
+}
+
+int64_t *GenericTensorAccessorW::get_int64_ptr() const {
+  return this->get<DataType::INT64>();
+}
+
+float *GenericTensorAccessorW::get_float_ptr() const {
+  return this->get<DataType::FLOAT>();
+}
+
+double *GenericTensorAccessorW::get_double_ptr() const {
+  return this->get<DataType::DOUBLE>();
+}
+
+half *GenericTensorAccessorW::get_half_ptr() const {
+  return this->get<DataType::HALF>();
+}
+
+std::string format_as(GenericTensorAccessorW const &a) {
+  return fmt::format("<GenericTensorAccessorW data_type={} shape={} ptr={}>",
+                     a.data_type,
+                     a.shape,
+                     a.ptr);
+}
+
+std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) {
+  return (s << fmt::to_string(a));
+}
+
+GenericTensorAccessorR::GenericTensorAccessorR(
+    DataType data_type,
+    ArrayShape const &shape,
+    void const *ptr,
+    DeviceType device_type = DeviceType::GPU)
+    : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {}
+
+std::tuple<DataType const &, ArrayShape const &, void const *const &, DeviceType const &>
+    GenericTensorAccessorR::tie() const {
+  return std::tie(this->data_type, this->shape, this->ptr, this->device_type);
+}
+
+bool GenericTensorAccessorR::operator==(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool GenericTensorAccessorR::operator!=(
+    GenericTensorAccessorR const &other) const {
+  return this->tie() != other.tie();
+}
+
+int32_t const *GenericTensorAccessorR::get_int32_ptr() const {
+  return this->get<DataType::INT32>();
+}
+
+int64_t const *GenericTensorAccessorR::get_int64_ptr() const {
+  return this->get<DataType::INT64>();
+}
+
+float const *GenericTensorAccessorR::get_float_ptr() const {
+  return this->get<DataType::FLOAT>();
+}
+
+double const *GenericTensorAccessorR::get_double_ptr() const {
+  return this->get<DataType::DOUBLE>();
+}
+
+half const *GenericTensorAccessorR::get_half_ptr() const {
+  return this->get<DataType::HALF>();
+}
+
+std::string format_as(GenericTensorAccessorR const &a) {
+  return fmt::format("<GenericTensorAccessorR data_type={} shape={} ptr={}>",
+                     a.data_type,
+                     a.shape,
+                     a.ptr);
+}
+
+std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) {
+  return (s << fmt::to_string(a));
+}
+
+int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::INT32>(a);
+}
+
+int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::INT64>(a);
+}
+
+float const *get_float_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::FLOAT>(a);
+}
+
+double const *get_double_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::DOUBLE>(a);
+}
+
+half const *get_half_ptr(GenericTensorAccessorR const &a) {
+  return get<DataType::HALF>(a);
+}
+
+std::vector<int32_t const *>
+    get_int32_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::INT32>(a);
+}
+
+std::vector<int64_t const *>
+    get_int64_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::INT64>(a);
+}
+
+std::vector<float const *>
+    get_float_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::FLOAT>(a);
+}
+
+std::vector<double const *>
+    get_double_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::DOUBLE>(a);
+}
+
+std::vector<half const *>
+    get_half_ptrs(std::vector<GenericTensorAccessorR> const &a) {
+  return get<DataType::HALF>(a);
+}
+
+GenericTensorAccessorR read_only_accessor_from_write_accessor(
+    GenericTensorAccessorW const &writable) {
+  return GenericTensorAccessorR{writable.data_type,
+                                writable.shape,
+                                req(writable.ptr),
+                                writable.device_type};
+}
+
+bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1,
+                              GenericTensorAccessorR const &acc2) {
+  return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type;
+}
+
+bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor,
+                             ArrayShape const &expected_shape,
+                             DataType const &expected_dtype) {
+  return accessor.shape == expected_shape &&
+         accessor.data_type == expected_dtype;
+}
+
+std::pair<ArrayShape, DataType>
+    get_shape_and_datatype(GenericTensorAccessorR const &accessor) {
+  return std::make_pair(accessor.shape, accessor.data_type);
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc
new file mode 100644
index 0000000000..b9f253bcff
--- /dev/null
+++ b/lib/kernels/src/kernels/allocation.cc
@@ -0,0 +1,38 @@
+#include "kernels/allocation.h"
+#include "op-attrs/tensor_shape.h"
+
+namespace FlexFlow {
+
+void *Allocator::allocate(size_t mem_size) {
+  return this->i_allocator->allocate(mem_size);
+}
+
+void Allocator::deallocate(void *ptr) {
+  this->i_allocator->deallocate(ptr);
+}
+
+DeviceType Allocator::get_allocation_device_type() const {
+  return this->i_allocator->get_allocation_device_type();
+}
+
+GenericTensorAccessorW
+    Allocator::allocate_tensor(TensorShape const &tensor_shape) {
+  void *ptr =
+      this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative());
+  return GenericTensorAccessorW{
+      tensor_shape.data_type,
+      array_shape_from_tensor_shape(tensor_shape),
+      ptr,
+      this->get_allocation_device_type(),
+  };
+}
+
+void Allocator::deallocate_tensor(GenericTensorAccessorW const &t) {
+  this->deallocate(t.ptr);
+}
+
+void Allocator::deallocate_tensor(GenericTensorAccessorR const &t) {
+  this->deallocate(const_cast<void *>(t.ptr));
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc
similarity index 51%
rename from lib/kernels/src/array_shape.cc
rename to lib/kernels/src/kernels/array_shape.cc
index
243185ada4..34a53c1bb3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -1,23 +1,20 @@ #include "kernels/array_shape.h" +#include "kernels/legion_ordered/slice.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/slice.h" +#include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/containers/vector_of.h" +#include "utils/hash/tuple.h" +#include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionOrdered - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - return LegionOrdered{reversed(vector_of(ff_ordered))}; -} - -ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) - : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} - -ArrayShape::ArrayShape(TensorShape const &shape) - : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {} - -ArrayShape::ArrayShape(std::vector const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} nonnegative_int ArrayShape::get_volume() const { @@ -59,10 +56,19 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { return this->tie() != other.tie(); } -ArrayShape ArrayShape::sub_shape( - std::optional> start, - std::optional> end) const { - NOT_IMPLEMENTED(); +ArrayShape + ArrayShape::sub_shape(ff_dim_t const &start, + std::optional const &maybe_end) const { + FFOrdered ff_ordered_dims = + ff_ordered_from_legion_ordered(this->dims); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; +} + +ArrayShape + ArrayShape::sub_shape(legion_dim_t const &start, + std::optional const &maybe_end) const { + return ArrayShape{slice(this->dims, start, maybe_end)}; } std::optional ArrayShape::at_maybe(legion_dim_t index) const { @@ -81,15 +87,6 @@ std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " get_array_coord_set(ArrayShape const &shape) { + std::vector> per_dim_ranges = + transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](nonnegative_int dim_size) -> std::vector { + return nonnegative_range(dim_size); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) { + return ArrayCoord{ff_ordered_of(raw_point)}; + }); +} + } // namespace FlexFlow + +namespace std { + +using namespace FlexFlow; + +size_t hash::operator()(ArrayShape const &s) const { + return get_std_hash(s.tie()); +} + +} // namespace std diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc new file mode 100644 index 0000000000..d8619d8ce6 --- /dev/null +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -0,0 +1,66 @@ +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CopyTensorAccessorW { + GenericTensorAccessorW operator()(GenericTensorAccessorW const 
&src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return dst_accessor; + } +}; + +GenericTensorAccessorW + copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +template +struct CopyTensorAccessorR { + GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + TensorShape shape = + get_tensor_shape(src_accessor.shape, src_accessor.data_type); + GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + + copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); + + return read_only_accessor_from_write_accessor(dst_accessor); + } +}; + +GenericTensorAccessorR + copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, + Allocator &allocator) { + return DataTypeDispatch1{}( + src_accessor.data_type, src_accessor, allocator); +} + +GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( + GenericTensorAccessorR const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_r(accessor, cpu_allocator); + } else { + return accessor; + } +} + +GenericTensorAccessorW copy_tensor_accessor_w_to_cpu_if_necessary( + GenericTensorAccessorW const &accessor, Allocator &cpu_allocator) { + if (accessor.device_type == DeviceType::GPU) { + return copy_tensor_accessor_w(accessor, cpu_allocator); + } else { + return accessor; + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..1b8ab35d89 --- /dev/null +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -0,0 +1,184 @@ +#include "kernels/format_accessor_contents.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +template +struct Print1DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 1_n); + + nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + + stream << "[" + << join_strings(nonnegative_range(ncols), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string( + accessor.at
<DT>
(FFOrdered{col_idx})); + }) + << "]"; + } +}; + +static std::string + format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 1_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print2DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 2_n); + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + + auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim1_size), + " ", + [&](nonnegative_int dim1_idx) -> std::string { + return fmt::to_string( + accessor.at
<DT>
(FFOrdered{dim0_idx, dim1_idx})); + }) + + "]"; + }; + + stream << "[\n" + << indent( + join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + << "\n]"; + } +}; + +static std::string + format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.shape.num_dims() == 2_n); + + std::ostringstream oss; + DataTypeDispatch1{}(accessor.data_type, accessor, oss); + return oss.str(); +} + +template +struct Print3DCPUAccessorR { + void operator()(GenericTensorAccessorR const &accessor, + std::ostream &stream) { + ASSERT(accessor.device_type == DeviceType::CPU); + nonnegative_int dims = accessor.shape.num_dims(); + ASSERT(dims == 3_n); + + nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + + auto render_1d = [&](nonnegative_int dim0_idx, + nonnegative_int dim1_idx) -> std::string { + return "[" + + join_strings(nonnegative_range(dim2_size), + " ", + [&](nonnegative_int dim2_idx) -> std::string { + return fmt::to_string(accessor.at
<DT>(
+                       FFOrdered{dim0_idx, dim1_idx, dim2_idx}));
+                 })
+          + "]";
+    };
+
+    auto render_2d = [&](nonnegative_int dim0_idx) -> std::string {
+      return "[\n"
+             + indent(join_strings(nonnegative_range(dim1_size),
+                                   "\n",
+                                   [&](nonnegative_int dim1_idx) -> std::string {
+                                     return render_1d(dim0_idx, dim1_idx);
+                                   }))
+             + "\n]";
+    };
+
+    stream << "[\n"
+           << indent(
+                  join_strings(nonnegative_range(dim0_size), "\n", render_2d))
+           << "\n]";
+  }
+};
+
+static std::string
+    format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
+  ASSERT(accessor.device_type == DeviceType::CPU);
+  ASSERT(accessor.shape.num_dims() == 3_n);
+
+  std::ostringstream oss;
+  DataTypeDispatch1<Print3DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  return oss.str();
+}
+
+static std::string
+    format_1d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_1d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
+static std::string
+    format_2d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_2d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
+static std::string
+    format_3d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_3d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
+std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+
+  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  switch (num_dims) {
+    case 1:
+      return format_1d_accessor_r_contents(cpu_accessor);
+    case 2:
+      return format_2d_accessor_r_contents(cpu_accessor);
+    case 3:
+      return format_3d_accessor_r_contents(cpu_accessor);
+    default:
+      PANIC("Unhandled accessor dimensionality", num_dims);
+  }
+}
+
+std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor =
+      copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator);
+
+  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  switch (num_dims) {
+    case 1:
+      return format_1d_accessor_w_contents(cpu_accessor);
+    case 2:
+      return format_2d_accessor_w_contents(cpu_accessor);
+    case 3:
+      return format_3d_accessor_w_contents(cpu_accessor);
+    default:
+      PANIC("Unhandled accessor dimensionality", num_dims);
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc
similarity index 78%
rename from lib/kernels/src/legion_dim.cc
rename to lib/kernels/src/kernels/legion_dim.cc
index bbb15c5636..f3482b1d9b 100644
--- a/lib/kernels/src/legion_dim.cc
+++ b/lib/kernels/src/kernels/legion_dim.cc
@@ -1,7 +1,11 @@
 #include "kernels/legion_dim.h"
+#include "utils/archetypes/value_type.h"
 namespace FlexFlow {
+using T = value_type<0>;
+template std::set<legion_dim_t> key_range(LegionOrdered<T> const &);
+
 legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) {
   return legion_dim_t{
       nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}};
@@ -11,6 +15,6 @@
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim,
                                     nonnegative_int num_dimensions) {
   return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() -
                                       ff_dim.value.unwrap_nonnegative() - 1}};
 }
 } // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc
b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..8af44173b0 --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,10 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct LegionOrdered; + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/slice.cc b/lib/kernels/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..69fcf570aa --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template LegionOrdered slice(LegionOrdered const &, + legion_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/legion_ordered/transform.cc b/lib/kernels/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..d9fb38198e --- /dev/null +++ b/lib/kernels/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template LegionOrdered transform(LegionOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/kernels/src/kernels/local_cpu_allocator.cc similarity index 52% rename from lib/local-execution/src/local_cpu_allocator.cc rename to lib/kernels/src/kernels/local_cpu_allocator.cc index 4ca5f987a8..738d1abf27 100644 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ b/lib/kernels/src/kernels/local_cpu_allocator.cc @@ -1,20 +1,27 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/device.h" #include "utils/containers/contains_key.h" +#include +#include namespace FlexFlow { void *LocalCPUAllocator::allocate(size_t requested_memory_size) { void *ptr = malloc(requested_memory_size); + ASSERT(ptr != nullptr); this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); return ptr; } void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains_key(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + free(ptr); + this->ptrs.erase(ptr); +} + +DeviceType LocalCPUAllocator::get_allocation_device_type() const { + return DeviceType::CPU; } Allocator create_local_cpu_memory_allocator() { diff --git a/lib/kernels/src/local_cuda_allocator.cc b/lib/kernels/src/kernels/local_cuda_allocator.cc similarity index 59% rename from lib/kernels/src/local_cuda_allocator.cc rename to lib/kernels/src/kernels/local_cuda_allocator.cc index cdcfb017a0..1b081517bf 100644 --- a/lib/kernels/src/local_cuda_allocator.cc +++ b/lib/kernels/src/kernels/local_cuda_allocator.cc @@ -1,6 +1,7 @@ #include "kernels/local_cuda_allocator.h" #include "kernels/device.h" #include "utils/containers/contains.h" +#include namespace FlexFlow { void *LocalCudaAllocator::allocate(size_t requested_memory_size) { @@ -11,13 +12,15 @@ void *LocalCudaAllocator::allocate(size_t requested_memory_size) { } void LocalCudaAllocator::deallocate(void *ptr) { - if 
(contains(this->ptrs, ptr)) { - checkCUDA(cudaFree(ptr)); - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this Allocator"); - } + ASSERT(contains(this->ptrs, ptr), + "Deallocating a pointer that was not allocated by this Allocator"); + + checkCUDA(cudaFree(ptr)); + this->ptrs.erase(ptr); +} + +DeviceType LocalCudaAllocator::get_allocation_device_type() const { + return DeviceType::GPU; } LocalCudaAllocator::~LocalCudaAllocator() { @@ -27,7 +30,8 @@ LocalCudaAllocator::~LocalCudaAllocator() { } Allocator create_local_cuda_memory_allocator() { - return Allocator::create(); + Allocator allocator = Allocator::create(); + return allocator; } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc new file mode 100644 index 0000000000..c647181872 --- /dev/null +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -0,0 +1,30 @@ +#include "kernels/reverse_kernels_params.h" + +namespace FlexFlow { + +ReverseKernelsParams + compute_reverse_kernels_params(ArrayShape const &output_shape, + ReverseAttrs const &attrs) { + auto axis = attrs.axis; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + if (i < axis.value) { + in_blk_size *= output_shape.at(ff_dim_t{i}); + } else if (i == axis.value) { + reverse_dim_size = output_shape.at(ff_dim_t{i}); + } else { + num_out_blks *= output_shape.at(ff_dim_t{i}); + } + } + + return ReverseKernelsParams{ + num_out_blks, + reverse_dim_size, + in_blk_size, + output_shape.get_volume(), + }; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/managed_ff_stream.cc b/lib/kernels/src/managed_ff_stream.cc index 7385b6cc3e..f0348aa91c 100644 --- a/lib/kernels/src/managed_ff_stream.cc +++ b/lib/kernels/src/managed_ff_stream.cc @@ -1,28 +1,36 @@ #include "kernels/managed_ff_stream.h" +#include "utils/exception.h" namespace FlexFlow { ManagedFFStream::ManagedFFStream() : stream(new ffStream_t) { - checkCUDA(cudaStreamCreate(stream)); + checkCUDA(cudaStreamCreate(this->stream)); } ManagedFFStream::ManagedFFStream(ManagedFFStream &&other) noexcept : stream(std::exchange(other.stream, nullptr)) {} ManagedFFStream &ManagedFFStream::operator=(ManagedFFStream &&other) noexcept { - std::swap(this->stream, other.stream); + if (this != &other) { + this->cleanup(); + this->stream = std::exchange(other.stream, nullptr); + } return *this; } ManagedFFStream::~ManagedFFStream() { - if (stream != nullptr) { - checkCUDA(cudaStreamDestroy(*stream)); - delete stream; + this->cleanup(); +} + +void ManagedFFStream::cleanup() { + if (this->stream != nullptr) { + checkCUDA(cudaStreamDestroy(*this->stream)); + delete this->stream; } } ffStream_t const &ManagedFFStream::raw_stream() const { - return *stream; + return *this->stream; } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..ea26d2350c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,16 +1,17 @@ #include "kernels/managed_per_device_ff_handle.h" -#include "device.h" +#include "internal/device.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { - handle = new PerDeviceFFHandle; - handle->workSpaceSize = 1024 * 1024; - 
handle->allowTensorOpMathConversion = true; - - checkCUDNN(cudnnCreate(&handle->dnn)); - checkCUBLAS(cublasCreate(&handle->blas)); - checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( + size_t workSpaceSize, bool allowTensorOpMathConversion) { + this->handle = new PerDeviceFFHandle{}; + this->handle->workSpaceSize = workSpaceSize; + this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; + + checkCUDNN(cudnnCreate(&this->handle->dnn)); + checkCUBLAS(cublasCreate(&this->handle->blas)); + checkCUDA(cudaMalloc(&this->handle->workSpace, this->handle->workSpaceSize)); } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -19,16 +20,23 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( ManagedPerDeviceFFHandle &ManagedPerDeviceFFHandle::operator=( ManagedPerDeviceFFHandle &&other) noexcept { - std::swap(this->handle, other.handle); + if (this != &other) { + this->cleanup(); + this->handle = std::exchange(other.handle, nullptr); + } return *this; } ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { - if (handle != nullptr) { - checkCUDNN(cudnnDestroy(handle->dnn)); - checkCUBLAS(cublasDestroy(handle->blas)); - checkCUDA(cudaFree(handle->workSpace)); - delete handle; + this->cleanup(); +} + +void ManagedPerDeviceFFHandle::cleanup() { + if (this->handle != nullptr) { + checkCUDNN(cudnnDestroy(this->handle->dnn)); + checkCUBLAS(cublasDestroy(this->handle->blas)); + checkCUDA(cudaFree(this->handle->workSpace)); + delete this->handle; } } diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 00da2d0d70..066cb96753 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_test_executable( cudnn cudart cublas + pcg ) set(FF_TEST_EXEC_NAME "kernels-tests") diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc new file mode 100644 index 0000000000..8630dcd8cd --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -0,0 +1,57 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/replicate_kernels_cpu.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Replicate::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = + create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + + GenericTensorAccessorR correct = input; + + Kernels::Replicate::cpu_forward_kernel(input, result); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + TEST_CASE("Replicate::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + + TensorShape result_shape = TensorShape{ + TensorDims{FFOrdered{3_n}}, + DataType::FLOAT, + }; + GenericTensorAccessorW result = + create_zero_filled_accessor_w(result_shape, cpu_allocator); + 
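+    // Replicate's backward pass reduces over the replica dimension: with the
+    // trailing argument (the replica count) set to 3, each of the 3 input
+    // gradients is the sum of one row of output above, matching correct.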
Kernels::Replicate::cpu_backward_kernel(output, result, 3); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } +} diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc new file mode 100644 index 0000000000..db0016cb0b --- /dev/null +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -0,0 +1,206 @@ +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/reverse_kernels_cpu.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("Reverse::cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } + + TEST_CASE("Reverse::cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + DataType::FLOAT, + }, + cpu_allocator); + + SUBCASE("axis = ff_dim_t{0}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{0_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{1}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{1_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 
3, 6}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + + SUBCASE("axis = ff_dim_t{2}") { + ReverseAttrs attrs = ReverseAttrs{ + /*axis=*/ff_dim_t{2_n}, + }; + + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, + }, + cpu_allocator); + + Kernels::Reverse::cpu_forward_kernel(input, result, attrs); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + "result=", + format_accessor_w_contents(result)); + } + } +} diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc new file mode 100644 index 0000000000..0f34a6aa06 --- /dev/null +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -0,0 +1,392 @@ +#include "internal/test_utils.h" +#include "op-attrs/tensor_shape.h" +#include "utils/containers/require_all_same1.h" +#include "utils/join_strings.h" +#include + +namespace FlexFlow { + +GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW result_accessor = allocator.allocate_tensor(shape); + fill_with_zeros(result_accessor); + return result_accessor; +} + +GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, + Allocator &allocator) { + GenericTensorAccessorW accessor = + create_zero_filled_accessor_w(shape, allocator); + return read_only_accessor_from_write_accessor(accessor); +} + +GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &contents, + Allocator &allocator) { + nonnegative_int ncols = num_elements(contents); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{col_idx}) = + contents.at(col_idx.unwrap_nonnegative()); + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_2d_accessor_w_with_contents( + std::vector> const &contents, Allocator &allocator) { + nonnegative_int nrows = num_elements(contents); + ASSERT(nrows > 0); + + nonnegative_int ncols = throw_if_unexpected( + require_all_same1(transform(contents, [](std::vector const &row) { + return num_elements(row); + }))); + ASSERT(ncols > 0); + + TensorShape shape = TensorShape{ + TensorDims{FFOrdered{nrows, ncols}}, + DataType::FLOAT, + }; + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); + + for (nonnegative_int row_idx : nonnegative_range(nrows)) { + for (nonnegative_int col_idx : nonnegative_range(ncols)) { + cpu_accessor.at(FFOrdered{row_idx, col_idx}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); + } + } + + GenericTensorAccessorW result = allocator.allocate_tensor(shape); + copy_accessor_data_to_l_from_r( + result, read_only_accessor_from_write_accessor(cpu_accessor)); + + return result; +} + +GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &contents, + Allocator &allocator) { + 
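+  // The nested contents must be rectangular: require_all_same1 below yields
+  // the one common size at each nesting level (dim0 x dim1 x dim2) and
+  // errors out on ragged input before the tensor is allocated.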
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<float>> const &m) {
+        return num_elements(m);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<float>> const &m) {
+        return throw_if_unexpected(
+            require_all_same1(transform(m, [](std::vector<float> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}},
+      DataType::FLOAT,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        cpu_accessor.at(
+            FFOrdered{dim0_idx, dim1_idx, dim2_idx}) =
+            contents.at(dim0_idx.unwrap_nonnegative())
+                .at(dim1_idx.unwrap_nonnegative())
+                .at(dim2_idx.unwrap_nonnegative());
+      }
+    }
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+GenericTensorAccessorW create_4d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator) {
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<float>>> const &t) {
+        return num_elements(t);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<float>>> const &m) {
+        return throw_if_unexpected(require_all_same1(
+            transform(m, [](std::vector<std::vector<float>> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<float>>> const &t) {
+        return throw_if_unexpected(require_all_same1(
+            transform(t, [](std::vector<std::vector<float>> const &mat) {
+              return throw_if_unexpected(require_all_same1(
+                  transform(mat, [](std::vector<float> const &vec) {
+                    return num_elements(vec);
+                  })));
+            })));
+      })));
+  ASSERT(dim3_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}},
+      DataType::FLOAT,
+  };
+
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) {
+          accessor.at(
+              FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) =
+              contents.at(dim0_idx.unwrap_nonnegative())
+                  .at(dim1_idx.unwrap_nonnegative())
+                  .at(dim2_idx.unwrap_nonnegative())
+                  .at(dim3_idx.unwrap_nonnegative());
+        }
+      }
+    }
+  }
+
+  return accessor;
+}
+
+GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<float> const &contents,
+                                       Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_1d_accessor_w_with_contents(contents, allocator));
+}
+
+GenericTensorAccessorR create_2d_accessor_r_with_contents(
+    std::vector<std::vector<float>> const &contents, Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_2d_accessor_w_with_contents(contents, allocator));
+}
+
+GenericTensorAccessorR create_3d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<float>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_3d_accessor_w_with_contents(contents, allocator));
+}
+
+GenericTensorAccessorR create_4d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_4d_accessor_w_with_contents(contents, allocator));
+}
+
+template <DataType DT>
+struct CreateRandomFilledAccessorW {
+  GenericTensorAccessorW operator()(TensorShape const &shape,
+                                    Allocator &allocator) {
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    using T = real_type_t
<DT>;
+    T *data_ptr = src_accessor.get<DT>
();
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    size_t num_elements = get_num_elements(shape).unwrap_nonnegative();
+    if constexpr (std::is_same<T, bool>::value) {
+      std::bernoulli_distribution dist(0.5);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_floating_point<T>::value) {
+      std::uniform_real_distribution<T> dist(-1.0, 1.0);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    } else if constexpr (std::is_integral<T>::value) {
+      std::uniform_int_distribution<T> dist(0, 99);
+      for (size_t i = 0; i < num_elements; i++) {
+        data_ptr[i] = dist(gen);
+      }
+    }
+
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+
+    return dst_accessor;
+  }
+};
+
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  return DataTypeDispatch1<CreateRandomFilledAccessorW>{}(
+      shape.data_type, shape, allocator);
+}
+
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator) {
+  GenericTensorAccessorW accessor =
+      create_random_filled_accessor_w(shape, allocator);
+
+  return read_only_accessor_from_write_accessor(accessor);
+}
+
+template <DataType DT>
+struct FillWithZeros {
+  void operator()(GenericTensorAccessorW const &accessor) {
+    using T = real_type_t
<DT>;
+
+    if (accessor.device_type == DeviceType::CPU) {
+      memset(accessor.ptr,
+             0,
+             accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T));
+    } else {
+      checkCUDA(cudaMemset(accessor.ptr,
+                           0,
+                           accessor.shape.get_volume().unwrap_nonnegative() *
+                               sizeof(T)));
+    }
+  }
+};
+
+void fill_with_zeros(GenericTensorAccessorW const &accessor) {
+  DataTypeDispatch1<FillWithZeros>{}(accessor.data_type, accessor);
+}
+
+template <DataType DT>
+struct CPUAccessorRContainsNonZero {
+  bool operator()(GenericTensorAccessorR const &accessor) {
+    using T = real_type_t
<DT>;
+
+    T const *data_ptr = accessor.get<DT>
();
+
+    int volume = accessor.shape.num_elements().unwrap_nonnegative();
+    for (int i = 0; i < volume; i++) {
+      if (data_ptr[i] != 0) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+
+bool contains_non_zero(GenericTensorAccessorR const &accessor) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR cpu_accessor =
+      copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
+  return DataTypeDispatch1<CPUAccessorRContainsNonZero>{}(
+      cpu_accessor.data_type, cpu_accessor);
+}
+
+template <DataType DT>
+struct AccessorsAreEqual {
+  bool operator()(GenericTensorAccessorR const &accessor_a,
+                  GenericTensorAccessorR const &accessor_b) {
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorR cpu_accessor_a =
+        copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator);
+    GenericTensorAccessorR cpu_accessor_b =
+        copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator);
+
+    using T = real_type_t
<DT>;
+    T const *a_data_ptr = cpu_accessor_a.get<DT>
();
+    T const *b_data_ptr = cpu_accessor_b.get<DT>
();
+
+    int volume = accessor_a.shape.num_elements().unwrap_nonnegative();
+    for (int i = 0; i < volume; i++) {
+      if (a_data_ptr[i] != b_data_ptr[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
+                         GenericTensorAccessorR const &accessor_b) {
+  ASSERT(accessor_a.shape == accessor_b.shape,
+         "accessors_are_equal expects accessors to have the same shape");
+
+  return DataTypeDispatch1<AccessorsAreEqual>{}(
+      accessor_a.data_type, accessor_a, accessor_b);
+}
+
+template <DataType DT>
+struct CreateFilledAccessorW {
+  GenericTensorAccessorW operator()(TensorShape const &shape,
+                                    Allocator &allocator,
+                                    DataTypeValue val) {
+    using T = real_type_t
<DT>;
+    if (!val.template has<T>()) {
+      throw mk_runtime_error("create_filled_accessor expected data type of "
+                             "shape and passed-in value to match");
+    }
+
+    auto unwrapped_value = val.get<T>();
+    GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+    GenericTensorAccessorW src_accessor = cpu_allocator.allocate_tensor(shape);
+
+    T *data_ptr = src_accessor.get<DT>
();
+
+    int volume = dst_accessor.shape.num_elements().unwrap_nonnegative();
+    for (int i = 0; i < volume; i++) {
+      data_ptr[i] = unwrapped_value;
+    }
+
+    copy_accessor_data_to_l_from_r(dst_accessor, src_accessor);
+    return dst_accessor;
+  }
+};
+
+GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+  return DataTypeDispatch1<CreateFilledAccessorW>{}(
+      shape.data_type, shape, allocator, val);
+}
+
+GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val) {
+  GenericTensorAccessorW w_accessor =
+      create_filled_accessor_w(shape, allocator, val);
+  return read_only_accessor_from_write_accessor(w_accessor);
+}
+} // namespace FlexFlow
diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h
new file mode 100644
index 0000000000..a4fc9b88c8
--- /dev/null
+++ b/lib/kernels/test/src/internal/test_utils.h
@@ -0,0 +1,78 @@
+#ifndef _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H
+#define _FLEXFLOW_KERNELS_TEST_SRC_INTERNAL_TEST_UTILS_H
+
+#include "kernels/copy_tensor_accessor.h"
+#include "kernels/datatype_dispatch.h"
+#include "kernels/device.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/local_cuda_allocator.h"
+#include "kernels/managed_ff_stream.h"
+#include "kernels/managed_per_device_ff_handle.h"
+#include "op-attrs/datatype.h"
+#include "op-attrs/datatype_value.dtg.h"
+#include
+#include
+#include
+#include
+
+namespace FlexFlow {
+
+GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape,
+                                                       Allocator &allocator);
+
+GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape,
+                                                       Allocator &allocator);
+
+GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape,
+                                                     Allocator &allocator);
+
+GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape,
+                                                     Allocator &allocator);
+
+GenericTensorAccessorW
+    create_1d_accessor_w_with_contents(std::vector<float> const &contents,
+                                       Allocator &allocator);
+GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<float> const &contents,
+                                       Allocator &allocator);
+
+GenericTensorAccessorW create_2d_accessor_w_with_contents(
+    std::vector<std::vector<float>> const &contents, Allocator &allocator);
+GenericTensorAccessorR create_2d_accessor_r_with_contents(
+    std::vector<std::vector<float>> const &contents, Allocator &allocator);
+
+GenericTensorAccessorW create_3d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<float>>> const &contents,
+    Allocator &allocator);
+GenericTensorAccessorR create_3d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<float>>> const &contents,
+    Allocator &allocator);
+
+GenericTensorAccessorW create_4d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator);
+GenericTensorAccessorR create_4d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
+    Allocator &allocator);
+
+bool contains_non_zero(GenericTensorAccessorR const &accessor);
+
+void fill_with_zeros(GenericTensorAccessorW const &accessor);
+
+void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor,
+                                       std::ostream &stream);
+
+bool accessors_are_equal(GenericTensorAccessorR const &accessor_a,
+                         GenericTensorAccessorR const &accessor_b);
+
+GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val);
+
+GenericTensorAccessorR create_filled_accessor_r(TensorShape const &shape,
+                                                Allocator &allocator,
+                                                DataTypeValue val);
+
+} // namespace FlexFlow
+
+#endif
diff --git
a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc new file mode 100644 index 0000000000..98f8471212 --- /dev/null +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -0,0 +1,73 @@ +#include "kernels/accessor.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("calculate_accessor_offset") { + SUBCASE("one dimension") { + std::vector indices = {4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 13_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 4_n; + + CHECK(result == correct); + } + + SUBCASE("multiple dimensions") { + std::vector indices = {2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 2_n * 5_n + 4_n; + + CHECK(result == correct); + } + + SUBCASE("zero dimensions") { + std::vector indices = {}; + ArrayShape shape = ArrayShape{std::vector{}}; + + nonnegative_int result = calculate_accessor_offset(indices, shape); + nonnegative_int correct = 0_n; + + CHECK(result == correct); + } + + SUBCASE("index and shape dimensions do not match") { + std::vector indices = {1_n, 2_n, 4_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + + SUBCASE("out of bounds index") { + std::vector indices = {2_n, 5_n}; + ArrayShape shape = ArrayShape{ + std::vector{ + 6_n, + 5_n, + }, + }; + + CHECK_THROWS(calculate_accessor_offset(indices, shape)); + } + } +} diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc new file mode 100644 index 0000000000..1fb4c0b541 --- /dev/null +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -0,0 +1,49 @@ +#include "kernels/array_shape.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_array_coord_set") { + SUBCASE("ArrayShape is not empty") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 1_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, + ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape has a dimension of size zero") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_n, 0_n, 3_n}, + }; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = {}; + + CHECK(result == correct); + } + + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; + + std::unordered_set result = get_array_coord_set(input); + std::unordered_set correct = { + ArrayCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc new file mode 100644 index 0000000000..915a84c335 --- /dev/null +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -0,0 +1,94 @@ +#include "kernels/format_accessor_contents.h" +#include "internal/test_utils.h" +#include "kernels/local_cpu_allocator.h" +#include + +using namespace 
::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("format_accessor_r_contents(GenericTensorAccessorR)") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("accessor is 1d") { + GenericTensorAccessorR accessor = + create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + + std::string correct = "[1 2 3 2]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 2d") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); + + std::string correct = "[\n" + " [1 2 3 5]\n" + " [4 3 3 2]\n" + " [1 1 5 8]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is 3d") { + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + }, + cpu_allocator); + + std::string correct = "[\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); + } + + SUBCASE("accessor is some other dimension") { + GenericTensorAccessorR accessor = + create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + + CHECK_THROWS(format_accessor_r_contents(accessor)); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc new file mode 100644 index 0000000000..34822ed1c3 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -0,0 +1,32 @@ +#include "kernels/legion_dim.h" +#include "test/utils/doctest/fmt/set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("key_range(LegionOrdered)") { + SUBCASE("input is non-empty") { + LegionOrdered input = {5, 3, 2, 3}; + + std::set result = key_range(input); + std::set correct = { + legion_dim_t{0_n}, + legion_dim_t{1_n}, + legion_dim_t{2_n}, + legion_dim_t{3_n}, + }; + + CHECK(result == correct); + } + + SUBCASE("input is empty") { + LegionOrdered input = {}; + + std::set result = key_range(input); + std::set correct = {}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc new file mode 100644 index 0000000000..4b50cad735 --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/legion_ordered.cc @@ -0,0 +1,12 @@ +#include "kernels/legion_ordered/legion_ordered.h" +#include "test/utils/rapidcheck.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE_TEMPLATE( + "Arbitrary> with T=", T, int, double, char) { + RC_SUBCASE([](LegionOrdered) {}); + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/slice.cc b/lib/kernels/test/src/kernels/legion_ordered/slice.cc new file mode 100644 index 0000000000..d0211d270e --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/slice.cc @@ -0,0 +1,30 @@ +#include "kernels/legion_ordered/slice.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("slice(LegionOrdered, ..., ...") { + LegionOrdered d = LegionOrdered{ + 
1, + 2, + 3, + 4, + }; + SUBCASE("legion_dim_t, legion_dim_t") { + LegionOrdered result = slice(d, + legion_dim_t{nonnegative_int{1}}, + legion_dim_t{nonnegative_int{3}}); + LegionOrdered correct = LegionOrdered{2, 3}; + + CHECK(result == correct); + } + SUBCASE("legion_dim_t, std::nullopt_t") { + LegionOrdered result = + slice(d, legion_dim_t{nonnegative_int{1}}, std::nullopt); + LegionOrdered correct = LegionOrdered{2, 3, 4}; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc new file mode 100644 index 0000000000..759507264f --- /dev/null +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -0,0 +1,36 @@ +#include "kernels/legion_ordered/transform.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("transform(LegionOrdered, F)") { + SUBCASE("input is empty") { + LegionOrdered input = {}; + + LegionOrdered result = + transform(input, [](std::string const &) -> int { + CHECK(false); + return 0; + }); + LegionOrdered correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input is not empty") { + LegionOrdered input = {2, 1, 2, 5}; + + LegionOrdered result = + transform(input, [](int x) { return fmt::to_string(x); }); + LegionOrdered correct = LegionOrdered{ + "2", + "1", + "2", + "5", + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..9064ae4824 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/attention_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { nonnegative_int num_samples = 10_n; nonnegative_int num_heads = 4_n; @@ -19,7 +19,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -39,16 +41,26 @@ TEST_SUITE(FF_TEST_SUITE) { /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), /*add_bias_kv=*/false); - TensorShape query_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, qSize}); - TensorShape key_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, kSize}); - TensorShape value_shape = make_float_tensor_shape_from_legion_dims( - {kvSeqLength, num_samples, vSize}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( - {nonnegative_int{state.weightSize}}); + TensorShape query_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, qSize}}, + DataType::FLOAT, + }; + TensorShape key_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, kSize}}, + DataType::FLOAT, + }; + TensorShape value_shape = TensorShape{ + TensorDims{FFOrdered{kvSeqLength, num_samples, vSize}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{qoSeqLength, num_samples, oProjSize}}, + DataType::FLOAT, + }; + TensorShape 
weight_shape = TensorShape{ + TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + DataType::FLOAT, + }; GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); @@ -72,9 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { weight_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..5f63b48198 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_matmul_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { nonnegative_int m = 10_n; nonnegative_int n = 10_n; @@ -15,16 +15,24 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape_a = - make_float_tensor_shape_from_legion_dims({m, k, batch}); - TensorShape input_shape_b = - make_float_tensor_shape_from_legion_dims({k, n, batch}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({m, n, batch}); + TensorShape input_shape_a = TensorShape{ + TensorDims{FFOrdered{batch, k, m}}, + DataType::FLOAT, + }; + TensorShape input_shape_b = TensorShape{ + TensorDims{FFOrdered{batch, n, k}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{batch, n, m}}, + DataType::FLOAT, + }; GenericTensorAccessorW a_accessor = create_random_filled_accessor_w(input_shape_a, allocator); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..903ad8cc43 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,10 +1,11 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/batch_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { nonnegative_int output_n = 1_n; nonnegative_int output_c = 10_n; @@ -12,7 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -26,25 +29,33 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_w=*/output_w.unwrap_nonnegative(), /*relu=*/true); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); - TensorShape scale_shape = make_float_tensor_shape_from_legion_dims( - {output_n, 
output_c, output_h, output_w}); - TensorShape bias_shape = make_float_tensor_shape_from_legion_dims( - {output_n, output_c, output_h, output_w}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape scale_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; + TensorShape bias_shape = TensorShape{ + TensorDims{FFOrdered{output_n, output_c, output_h, output_w}}, + DataType::FLOAT, + }; GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - GenericTensorAccessorW scale_accessor = - create_filled_accessor_w(scale_shape, allocator, 1.0f); + GenericTensorAccessorW scale_accessor = create_filled_accessor_w( + scale_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { - GenericTensorAccessorW bias_accessor = - create_filled_accessor_w(bias_shape, allocator, 0.0f); + GenericTensorAccessorW bias_accessor = create_filled_accessor_w( + bias_shape, allocator, make_float_data_type_value(0)); Kernels::BatchNorm::forward_kernel( /*stream=*/managed_stream.raw_stream(), @@ -54,10 +65,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*scale_ptr=*/scale_accessor.get_float_ptr(), /*bias_ptr=*/bias_accessor.get_float_ptr()); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { @@ -73,9 +81,9 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::BatchNorm::backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, - /*input_ptr=*/input_accessor.get_float_ptr(), - /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), /*output_ptr=*/output_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + /*input_ptr=*/input_accessor.get_float_ptr(), /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), /*scale_ptr=*/scale_accessor.get_float_ptr(), /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), @@ -83,19 +91,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*numElements=*/ input_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - std::vector host_scale_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(scale_grad_accessor)); - std::vector host_bias_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(bias_grad_accessor)); - - CHECK(contains_non_zero(host_input_grad_data)); - CHECK(contains_non_zero(host_scale_grad_data)); - CHECK(contains_non_zero(host_bias_grad_data)); + CHECK(contains_non_zero(input_grad_accessor)); + CHECK(contains_non_zero(scale_grad_accessor)); + CHECK(contains_non_zero(bias_grad_accessor)); } Kernels::BatchNorm::cleanup_kernel(allocator, diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0e0769014d..0c41fe12ac 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,56 +1,86 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" 
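+// Exercises the CUDA cast kernels; the "Check Cast Forward Kernel against
+// CPU Kernel" test case below compares GPU results with the CPU reference
+// kernels from cast_kernels_cpu.h.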
#include "kernels/cast_kernels.h" -#include "test_utils.h" -#include +#include "kernels/cast_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Cast Forward and Backward Kernels") { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); - TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100_n, 100_n}); - - GenericTensorAccessorW output_accessor = - create_random_filled_accessor_w(output_shape, allocator); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::DOUBLE, + }; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - - Kernels::Cast::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - DataType::FLOAT, - DataType::DOUBLE); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - std::vector host_double_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); - CHECK(contains_non_zero(host_double_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { + GenericTensorAccessorR grad_output_accessor = + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Cast::backward_kernel( - managed_stream.raw_stream(), - read_only_accessor_from_write_accessor(output_accessor), - grad_input_accessor, - DataType::DOUBLE, - DataType::FLOAT); - - std::vector host_grad_float_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(grad_input_accessor)); - CHECK(contains_non_zero(host_grad_float_data)); + create_zero_filled_accessor_w(input_shape, allocator); + + Kernels::Cast::backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); + + CHECK(contains_non_zero(grad_input_accessor)); + } + } + + TEST_CASE("Check Cast Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 2_n}}, + DataType::DOUBLE, + }; + + // Only calling forward kernel as backward kernel is exactly the same + SUBCASE("forward_kernel") { + // Run GPU Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + create_zero_filled_accessor_w(output_shape, gpu_allocator); + + Kernels::Cast::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu = + 
create_zero_filled_accessor_w(output_shape, cpu_allocator); + + Kernels::Cast::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..2040dcbd5d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,39 +1,39 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/combine_kernels.h" -#include "test_utils.h" +#include "kernels/combine_kernels_cpu.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Call Combine Forward and Backward Kernels") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100_n, 100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n, 100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Combine::forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); @@ -41,9 +41,66 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor, input_grad_accessor); - std::vector host_input_grad = load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad)); + CHECK(contains_non_zero(input_grad_accessor)); + } + } + + TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { + ManagedFFStream managed_stream{}; + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{5_n, 5_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = input_shape; + + SUBCASE("forward_kernel") { + // Run GPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_gpu = + create_random_filled_accessor_r(input_shape, gpu_allocator); + GenericTensorAccessorW output_accessor_gpu = + gpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::forward_kernel( + managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); + + // Run CPU Combine Forward Kernel + GenericTensorAccessorR input_accessor_cpu = + copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); + GenericTensorAccessorW output_accessor_cpu 
= + cpu_allocator.allocate_tensor(output_shape); + + Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, + output_accessor_cpu); + + CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); + } + + SUBCASE("backward_kernel") { + // Run GPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_gpu = + create_random_filled_accessor_r(output_shape, gpu_allocator); + GenericTensorAccessorW input_grad_accessor_gpu = + create_zero_filled_accessor_w(input_shape, gpu_allocator); + + Kernels::Combine::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu); + + // Run CPU Combine Backward Kernel + GenericTensorAccessorR output_grad_accessor_cpu = + copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); + GenericTensorAccessorW input_grad_accessor_cpu = + create_zero_filled_accessor_w(input_shape, cpu_allocator); + + Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, + input_grad_accessor_cpu); + + CHECK(accessors_are_equal(input_grad_accessor_gpu, + input_grad_accessor_cpu)); } } } diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..c2df907917 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,56 +1,113 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/concat_kernels.h" -#include "test_utils.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - nonnegative_int num_inputs = 3_n; - nonnegative_int size_per_input = 100_n; - ff_dim_t concat_axis = ff_dim_t{0_n}; - - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({size_per_input}); - TensorShape output_shape = - make_float_tensor_shape_from_legion_dims({size_per_input, num_inputs}); - Allocator allocator = create_local_cuda_memory_allocator(); + const nonnegative_int num_inputs = 4_n; + SUBCASE("forward_kernel") { - std::vector input_accessors = - repeat(num_inputs, [&]() { - return read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - }); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_data)); + auto run_forward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + std::vector input_accessors = + repeat(num_inputs, [&]() { + return create_random_filled_accessor_r(input_shape, allocator); + }); + + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Concat::forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test forward concat, axis = 
0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test forward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_forward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = repeat( - num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - concat_axis); + auto run_backward_test = [&](nonnegative_int input_rows, + nonnegative_int input_cols, + TensorShape output_shape, + ff_dim_t concat_axis) { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, input_cols}}, + DataType::FLOAT, + }; + + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + + std::vector input_grad_accessors = + repeat(num_inputs, [&]() { + return create_zero_filled_accessor_w(input_shape, allocator); + }); + + Kernels::Concat::backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); + + for (auto &accessor : input_grad_accessors) { + CHECK(contains_non_zero(accessor)); + } + }; + + SUBCASE("test backward concat, axis = 0") { + nonnegative_int input_rows = 2_n; + nonnegative_int input_cols = 4_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{0_n}); + } + + SUBCASE("test backward concat, axis = 1") { + nonnegative_int input_rows = 4_n; + nonnegative_int input_cols = 2_n; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, + DataType::FLOAT, + }; + run_backward_test(input_rows, input_cols, output_shape, ff_dim_t{1_n}); + } } } } diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index ed5852bc31..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,10 +1,10 @@ -#include "doctest/doctest.h" -#include "test_utils.h" +#include "internal/test_utils.h" +#include #include namespace FlexFlow { -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test CUDA") { int deviceCount = 0; diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..409b06d9a9 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,38 +1,37 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/dropout_kernels.h" -#include "test_utils.h" #include "utils/containers/count.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Dropout Kernels") { unsigned long long seed = 12345; float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_n, 10_n}, }; 
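+    // With dropout_rate = 0.1, roughly 10% of the 10x10 activations are
+    // expected to be zeroed; the forward subcase below only checks that the
+    // kernel produces some non-zero output.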
- TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; Allocator allocator = create_local_cuda_memory_allocator(); DropoutPerDeviceState state = Kernels::Dropout::init_kernel( managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); - auto get_zero_count = [](std::vector const &data) { - return count(data, [](float x) { return x == 0.0f; }); - }; - SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); @@ -41,11 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor.get_float_ptr(), output_accessor.get_float_ptr()); - std::vector host_output_accessor = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - CHECK(contains_non_zero(host_output_accessor)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..f8a3abdb98 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,21 +1,27 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/flat_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 2.0f)); + read_only_accessor_from_write_accessor(create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2))); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = @@ -25,33 +31,21 @@ TEST_SUITE(FF_TEST_SUITE) { input_accessor, output_accessor.get_float_ptr()); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorW output_grad_accessor = - create_filled_accessor_w(output_shape, allocator, 0.0f); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 1.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, 
make_float_data_type_value(0)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(1)); Kernels::Flat::backward_kernel(managed_stream.raw_stream(), input_accessor, - input_grad_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr()); - - std::vector backward_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(backward_output_data == expected_output_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..f0be809475 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,61 +1,107 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/gather_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{2_n}}; + legion_dim_t{0_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); - GenericTensorAccessorR index_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + Kernels::Gather::forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); - - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test 
gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); - GenericTensorAccessorW input_grad_accessor = - create_random_filled_accessor_w(input_shape, allocator); - - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); - - std::vector host_input_grad_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_input_grad_data)); + auto run_backward_test = [&](TensorShape input_shape, + TensorShape index_shape, + TensorShape output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } } } } diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..02a95ba58a 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,23 +1,30 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/layer_norm_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { nonnegative_int batch_size = 10_n; nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({batch_size, feature_size}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, feature_size}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; - TensorShape feature_shape = - make_float_tensor_shape_from_legion_dims({feature_size}); + TensorShape feature_shape = TensorShape{ + TensorDims{FFOrdered{feature_size}}, + DataType::FLOAT, + }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -31,16 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { epsilon); GenericTensorAccessorR input_accessor = - 
read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); - GenericTensorAccessorW gamma_accessor = - create_filled_accessor_w(feature_shape, allocator, 1.0f); + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(1)); SUBCASE("forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - GenericTensorAccessorW beta_accessor = - create_filled_accessor_w(feature_shape, allocator, 0.0f); + GenericTensorAccessorW beta_accessor = create_filled_accessor_w( + feature_shape, allocator, make_float_data_type_value(0)); Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), state, @@ -52,8 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); GenericTensorAccessorW gamma_grad_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc new file mode 100644 index 0000000000..fb5920adcc --- /dev/null +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -0,0 +1,107 @@ +#include "internal/test_utils.h" +#include "kernels/gather_kernels.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedFFStream") { + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + ManagedFFStream managed_stream{}; + Allocator allocator = create_local_cuda_memory_allocator(); + + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{0_n}}; + + SUBCASE("forward_kernel") { + auto run_forward_test = [&](TensorShape const &input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR input_accessor = + create_random_filled_accessor_r(input_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW output_accessor = + allocator.allocate_tensor(output_shape); + + Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input=*/input_accessor, + /*index=*/index_accessor, + /*output=*/output_accessor); + + CHECK(contains_non_zero(output_accessor)); + }; + + SUBCASE("test gather forward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 20_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + + SUBCASE("test gather forward, 1D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{10_n}}, + DataType::FLOAT, + }; + run_forward_test(input_shape, index_shape, output_shape); + } + } + + SUBCASE("backward_kernel") { + auto run_backward_test = [&](TensorShape const 
&input_shape, + TensorShape const &index_shape, + TensorShape const &output_shape) { + GenericTensorAccessorR output_grad_accessor = + create_random_filled_accessor_r(output_shape, allocator); + GenericTensorAccessorR index_accessor = + create_random_filled_accessor_r(index_shape, allocator); + GenericTensorAccessorW input_grad_accessor = + allocator.allocate_tensor(input_shape); + + Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*output_grad=*/output_grad_accessor, + /*index=*/index_accessor, + /*input_grad=*/input_grad_accessor); + CHECK(contains_non_zero(input_grad_accessor)); + }; + + SUBCASE("test gather backward, 2D") { + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 100_n}}, + DataType::FLOAT, + }; + TensorShape index_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::INT32, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{2_n, 25_n}}, + DataType::FLOAT, + }; + run_backward_test(input_shape, index_shape, output_shape); + } + } + } +} diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc new file mode 100644 index 0000000000..fc67764cdb --- /dev/null +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -0,0 +1,37 @@ +#include "kernels/managed_per_device_ff_handle.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Test ManagedPerDeviceFFHandle") { + ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); + + SUBCASE("constructor") { + CHECK(base_handle.raw_handle().workSpaceSize == 1024 * 1024); + CHECK(base_handle.raw_handle().allowTensorOpMathConversion == true); + } + + SUBCASE("move constructor") { + ManagedPerDeviceFFHandle new_handle(std::move(base_handle)); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assignment operator") { + SUBCASE("move assign to other") { + ManagedPerDeviceFFHandle new_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; + new_handle = std::move(base_handle); + CHECK(&new_handle.raw_handle() == base_handle_ptr); + } + + SUBCASE("move assign to self") { + base_handle = std::move(base_handle); + CHECK(&base_handle.raw_handle() == base_handle_ptr); + } + } + } +} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..5452266dad 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,12 +1,15 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/partition_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -14,48 +17,36 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( managed_handle.raw_handle(), DataType::FLOAT); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); 
+ TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + GenericTensorAccessorR input_accessor = create_filled_accessor_r( + input_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Repartition::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( + output_shape, allocator, make_float_data_type_value(1)); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(2)); Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..f2ada8387e 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,9 +1,10 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/pool_2d_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { nonnegative_int input_w = 10_n; nonnegative_int input_h = 10_n; @@ -22,7 +23,9 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -46,10 +49,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*stride_w=*/stride_w.unwrap_nonnegative(), /*pool_type=*/pool_type); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims( - {input_w, input_h, input_c, input_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims( - {output_w, output_h, output_c, output_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + 
TensorDims{FFOrdered{output_n, output_c, output_h, output_w}},
+        DataType::FLOAT,
+    };
 
     GenericTensorAccessorW input_accessor =
         create_random_filled_accessor_w(input_shape, allocator);
@@ -62,28 +69,23 @@ TEST_SUITE(FF_TEST_SUITE) {
                                      input_accessor.ptr,
                                      output_accessor.ptr);
 
-      std::vector<float> host_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(host_output_data));
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
-      GenericTensorAccessorW output_grad_accessor =
-          create_filled_accessor_w(output_shape, allocator, 1.0f);
+      GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w(
+          output_shape, allocator, make_float_data_type_value(1));
       GenericTensorAccessorW input_grad_accessor =
           allocator.allocate_tensor(input_shape);
 
       Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(),
                                        state,
-                                       input_accessor.ptr,
-                                       input_grad_accessor.ptr,
                                        output_accessor.ptr,
-                                       output_grad_accessor.ptr);
+                                       output_grad_accessor.ptr,
+                                       input_accessor.ptr,
+                                       input_grad_accessor.ptr);
 
-      std::vector<float> host_input_grad = load_data_to_host_from_device<float>(
-          read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(contains_non_zero(host_input_grad));
+      CHECK(contains_non_zero(input_grad_accessor));
     }
   }
 }
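All of the kernel tests rewritten in this patch converge on the same harness: a TensorShape built inline, device inputs filled with create_random_filled_accessor_r, the kernel launched on a ManagedFFStream, and the result smoke-tested with contains_non_zero. For reference, a condensed sketch of that shared skeleton follows; it is illustrative only — the test-case name and the commented-out run_kernel_under_test call are placeholders, not code from this patch.

    #include "internal/test_utils.h"
    #include <doctest/doctest.h>

    using namespace ::FlexFlow;

    TEST_SUITE(FF_CUDA_TEST_SUITE) {
      TEST_CASE("Sketch: shared CUDA kernel test harness") {
        // One handle, stream, and device allocator per test case, as above.
        ManagedPerDeviceFFHandle managed_handle{
            /*workSpaceSize=*/1024 * 1024,
            /*allowTensorOpMathConversion=*/true};
        ManagedFFStream managed_stream{};
        Allocator allocator = create_local_cuda_memory_allocator();

        TensorShape shape = TensorShape{
            TensorDims{FFOrdered{10_n, 10_n}},
            DataType::FLOAT,
        };

        // Random device-side input, freshly allocated output.
        GenericTensorAccessorR input =
            create_random_filled_accessor_r(shape, allocator);
        GenericTensorAccessorW output = allocator.allocate_tensor(shape);

        // run_kernel_under_test(managed_stream.raw_stream(), input, output);

        // Smoke test: the kernel wrote something non-zero to the output.
        CHECK(contains_non_zero(output));
      }
    }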
diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc
index 04a3817b84..e13b149769 100644
--- a/lib/kernels/test/src/test_reduction_kernel.cc
+++ b/lib/kernels/test/src/test_reduction_kernel.cc
@@ -1,27 +1,33 @@
-#include "doctest/doctest.h"
+#include "internal/test_utils.h"
 #include "kernels/reduction_kernels.h"
-#include "test_utils.h"
+#include "op-attrs/datatype_value.h"
+#include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
-TEST_SUITE(FF_TEST_SUITE) {
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
   TEST_CASE("Test Reduction Forward and Backward Kernel") {
     std::size_t num_replicas = 5;
 
-    TensorShape input_shape = make_float_tensor_shape_from_legion_dims(
-        {10_n, 10_n, 10_n, 10_n, 10_n});
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}},
+        DataType::FLOAT,
+    };
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
     Allocator allocator = create_local_cuda_memory_allocator();
 
     SUBCASE("forward_kernel") {
-      TensorShape output_shape =
-          make_float_tensor_shape_from_legion_dims({10_n});
+      TensorShape output_shape = TensorShape{
+          TensorDims{FFOrdered{10_n}},
+          DataType::FLOAT,
+      };
 
       GenericTensorAccessorR input_accessor =
-          read_only_accessor_from_write_accessor(
-              create_random_filled_accessor_w(input_shape, allocator));
+          create_random_filled_accessor_r(input_shape, allocator);
       GenericTensorAccessorW output_accessor =
           allocator.allocate_tensor(output_shape);
 
@@ -30,30 +36,22 @@ TEST_SUITE(FF_TEST_SUITE) {
                                          output_accessor,
                                          num_replicas);
 
-      std::vector<float> host_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(host_output_data));
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
       TensorShape output_shape = input_shape;
 
-      GenericTensorAccessorR output_grad_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(output_shape, allocator, 1.0f));
+      GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r(
+          output_shape, allocator, make_float_data_type_value(1));
       GenericTensorAccessorW input_grad_accessor =
           allocator.allocate_tensor(input_shape);
 
       Kernels::Reduction::backward_kernel(managed_stream.raw_stream(),
-                                          input_grad_accessor,
-                                          output_grad_accessor);
-
-      std::vector<float> expected_grad_input_data(
-          input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f);
-      std::vector<float> host_grad_data = load_data_to_host_from_device<float>(
-          read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(host_grad_data == expected_grad_input_data);
+                                          output_grad_accessor,
+                                          input_grad_accessor);
+
+      CHECK(contains_non_zero(input_grad_accessor));
     }
   }
 }
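Context for the replicate test that follows: Replicate's backward pass reduces the replicated gradient back down to the input shape by summing over replicas, which is what the test's hand-computed expectation ({1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}) encodes. The sketch below spells that reduction out over plain buffers; replicate_backward_reference is an illustrative name, not a function in this patch, and it assumes replicas are laid out contiguously per element, which is the layout the test's row-wise sums imply.

    #include <cstddef>

    // CPU reference for Replicate's backward pass: each input gradient is the
    // sum of that element's gradient across all replicas.
    void replicate_backward_reference(float const *output_grad,
                                      float *input_grad,
                                      std::size_t num_elements,
                                      std::size_t num_replicas) {
      for (std::size_t i = 0; i < num_elements; i++) {
        float total = 0.0f;
        for (std::size_t r = 0; r < num_replicas; r++) {
          // Replica r of element i, e.g. {1, 2, 3} -> 6, {4, 3, 3} -> 10,
          // {1, 3, 5} -> 9 in the test below.
          total += output_grad[i * num_replicas + r];
        }
        input_grad[i] = total;
      }
    }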
diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc
index fa726898f2..83a9a992f7 100644
--- a/lib/kernels/test/src/test_replicate_kernel.cc
+++ b/lib/kernels/test/src/test_replicate_kernel.cc
@@ -1,55 +1,150 @@
-#include "doctest/doctest.h"
+#include "internal/test_utils.h"
+#include "kernels/format_accessor_contents.h"
 #include "kernels/replicate_kernels.h"
-#include "test_utils.h"
+#include "kernels/replicate_kernels_cpu.h"
+#include "test/utils/doctest/check_kv.h"
+#include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
-TEST_SUITE(FF_TEST_SUITE) {
-  TEST_CASE("Test Replicate Kernel") {
-    nonnegative_int num_replicas = 10_n;
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("Call Replicate Forward and Backward Kernels") {
+    nonnegative_int num_replicas = 3_n;
 
-    TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n});
-    TensorShape output_shape = input_shape;
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{3_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = TensorShape{
+        TensorDims{FFOrdered{3_n}},
+        DataType::FLOAT,
+    };
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
-    Allocator allocator = create_local_cuda_memory_allocator();
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
     SUBCASE("forward_kernel") {
-      GenericTensorAccessorR input_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(input_shape, allocator, 1.0f));
-      GenericTensorAccessorW output_accessor =
-          allocator.allocate_tensor(output_shape);
+      GenericTensorAccessorR input =
+          create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator);
+
+      GenericTensorAccessorW output =
+          gpu_allocator.allocate_tensor(output_shape);
 
       Kernels::Replicate::forward_kernel(
-          managed_stream.raw_stream(), input_accessor, output_accessor);
+          managed_stream.raw_stream(), input, output);
 
-      std::vector<float> check_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
+      GenericTensorAccessorR correct = input;
 
-      std::vector<float> expected_output_data(
-          input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f);
-      CHECK(check_output_data == expected_output_data);
+      CHECK_MESSAGE(accessors_are_equal(output, correct),
+                    check_kv("output", format_accessor_w_contents(output)));
     }
 
     SUBCASE("backward_kernel") {
-      GenericTensorAccessorW input_grad_accessor =
-          create_filled_accessor_w(input_shape, allocator, 1.0f);
-      GenericTensorAccessorR output_grad_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(output_shape, allocator, 1.0f));
+      GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents(
+          {
+              {1, 2, 3},
+              {4, 3, 3},
+              {1, 3, 5},
+          },
+          gpu_allocator);
+
+      GenericTensorAccessorR correct = create_1d_accessor_r_with_contents(
+          {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator);
+
+      GenericTensorAccessorW input_grad =
+          gpu_allocator.allocate_tensor(input_shape);
 
       Kernels::Replicate::backward_kernel(managed_stream.raw_stream(),
-                                          input_grad_accessor,
-                                          output_grad_accessor,
+                                          output_grad,
+                                          input_grad,
                                           num_replicas.unwrap_nonnegative());
 
-      std::vector<float> check_aggregated_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(contains_non_zero(check_aggregated_data));
+      CHECK_MESSAGE(
+          accessors_are_equal(input_grad, correct),
+          check_kv("input_grad", format_accessor_w_contents(input_grad)));
+    }
+  }
+
+  TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") {
+    nonnegative_int num_replicas = 2_n;
+
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{5_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = TensorShape{
+        TensorDims{FFOrdered{5_n, num_replicas}},
+        DataType::FLOAT,
+    };
+
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
+    ManagedFFStream managed_stream{};
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    SUBCASE("forward_kernel") {
+      // Run GPU Replicate Forward Kernel
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r(input_shape, gpu_allocator);
+      GenericTensorAccessorW output_accessor_gpu =
+          create_zero_filled_accessor_w(output_shape, gpu_allocator);
+
+      Kernels::Replicate::forward_kernel(
+          managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu);
+
+      // Run CPU Replicate Forward Kernel
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW output_accessor_cpu =
+          create_zero_filled_accessor_w(output_shape, cpu_allocator);
+
+      Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu,
+                                             output_accessor_cpu);
+
+      CHECK_MESSAGE(
+          accessors_are_equal(output_accessor_gpu, output_accessor_cpu),
+          check_kv("input", format_accessor_r_contents(input_accessor_cpu)),
+          check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)),
+          check_kv("cpu", format_accessor_w_contents(output_accessor_cpu)));
+    }
+
+    SUBCASE("backward_kernel") {
+      // Run GPU Replicate Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r(output_shape, gpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_gpu =
+          create_zero_filled_accessor_w(input_shape, gpu_allocator);
+
+      Kernels::Replicate::backward_kernel(managed_stream.raw_stream(),
+                                          output_grad_accessor_gpu,
+                                          input_grad_accessor_gpu,
+                                          num_replicas.unwrap_nonnegative());
+
+      // Run CPU Replicate Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_cpu =
+          create_zero_filled_accessor_w(input_shape, cpu_allocator);
+
+      Kernels::Replicate::cpu_backward_kernel(
+          output_grad_accessor_cpu,
+          input_grad_accessor_cpu,
+          num_replicas.unwrap_nonnegative());
+
+      CHECK_MESSAGE(
+          accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu),
+          check_kv("output_grad",
+                   format_accessor_r_contents(output_grad_accessor_cpu)),
+          check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)),
+          check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu)));
+    }
} } diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..66c6bf849b 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,16 +1,21 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reshape_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -18,42 +23,28 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(input_shape, allocator, 1.0f)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Reshape::forward_kernel( managed_stream.raw_stream(), state, input_accessor, output_accessor); - std::vector check_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - - std::vector expected_output_data( - input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); - CHECK(check_output_data == expected_output_data); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_filled_accessor_w(output_shape, allocator, 1.0f)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 2.0f); + allocator.allocate_tensor(input_shape); Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), state, - input_grad_accessor, - output_grad_accessor); - - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); + output_grad_accessor, + input_grad_accessor); - std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); - CHECK(host_grad_input_data == expected_grad_input_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..6e12c48ac3 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,63 +1,124 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels.h" -#include "test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "op-attrs/datatype_value.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - nonnegative_int reverse_dim_size = 10_n; - nonnegative_int in_blk_size = 10_n; - nonnegative_int num_out_blks = 1_n; - - TensorShape input_shape = 
make_float_tensor_shape_from_legion_dims({100_n});
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{1_n, 10_n, 10_n}},
+        DataType::FLOAT,
+    };
     TensorShape output_shape = input_shape;
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
     Allocator allocator = create_local_cuda_memory_allocator();
 
+    ReverseAttrs attrs = ReverseAttrs{
+        /*axis=*/ff_dim_t{0_n},
+    };
+
     SUBCASE("forward_kernel") {
       GenericTensorAccessorR input_accessor =
-          read_only_accessor_from_write_accessor(
-              create_filled_accessor_w(input_shape, allocator, 1.0f));
+          read_only_accessor_from_write_accessor(create_filled_accessor_w(
+              input_shape, allocator, make_float_data_type_value(1)));
       GenericTensorAccessorW output_accessor =
           allocator.allocate_tensor(output_shape);
 
       Kernels::Reverse::forward_kernel(
-          managed_stream.raw_stream(),
-          input_accessor.get_float_ptr(),
-          output_accessor.get_float_ptr(),
-          num_out_blks.unwrap_nonnegative(),
-          reverse_dim_size.unwrap_nonnegative(),
-          in_blk_size.unwrap_nonnegative(),
-          input_accessor.shape.num_elements().unwrap_nonnegative());
-
-      std::vector<float> check_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(check_output_data));
+          managed_stream.raw_stream(), input_accessor, output_accessor, attrs);
+
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
       GenericTensorAccessorW output_grad_accessor =
           create_random_filled_accessor_w(output_shape, allocator);
       GenericTensorAccessorW input_grad_accessor =
-          create_random_filled_accessor_w(input_shape, allocator);
-
-      Kernels::Reverse::backward_kernel(
-          managed_stream.raw_stream(),
-          output_grad_accessor.get_float_ptr(),
-          input_grad_accessor.get_float_ptr(),
-          num_out_blks.unwrap_nonnegative(),
-          reverse_dim_size.unwrap_nonnegative(),
-          in_blk_size.unwrap_nonnegative(),
-          input_grad_accessor.shape.num_elements().unwrap_nonnegative());
-
-      std::vector<float> host_grad_input_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(input_grad_accessor));
-      CHECK(contains_non_zero(host_grad_input_data));
+          allocator.allocate_tensor(input_shape);
+
+      Kernels::Reverse::backward_kernel(managed_stream.raw_stream(),
+                                        output_grad_accessor,
+                                        input_grad_accessor,
+                                        attrs);
+
+      CHECK(contains_non_zero(input_grad_accessor));
+    }
+  }
+
+  TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") {
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{1_n, 4_n, 3_n}},
+        DataType::FLOAT,
+    };
+    TensorShape output_shape = input_shape;
+
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
+    ManagedFFStream managed_stream{};
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    ReverseAttrs attrs = ReverseAttrs{
+        /*axis=*/ff_dim_t{0_n},
+    };
+
+    SUBCASE("forward_kernel") {
+      // Run GPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_gpu =
+          create_random_filled_accessor_r(input_shape, gpu_allocator);
+      GenericTensorAccessorW output_accessor_gpu =
+          create_zero_filled_accessor_w(output_shape, gpu_allocator);
+
+      Kernels::Reverse::forward_kernel(managed_stream.raw_stream(),
+                                       input_accessor_gpu,
+                                       output_accessor_gpu,
+                                       attrs);
+
+      // Run CPU Reverse Forward Kernel
+      GenericTensorAccessorR input_accessor_cpu =
+          copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW output_accessor_cpu =
+          create_zero_filled_accessor_w(output_shape, cpu_allocator);
+
+      Kernels::Reverse::cpu_forward_kernel(
+          input_accessor_cpu, output_accessor_cpu, attrs);
+
+      CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu));
+    }
+
+    SUBCASE("backward_kernel") {
+      // Run GPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_gpu =
+          create_random_filled_accessor_r(output_shape, gpu_allocator);
+
+      GenericTensorAccessorW input_grad_accessor_gpu =
+          create_zero_filled_accessor_w(input_shape, gpu_allocator);
+
+      Kernels::Reverse::backward_kernel(managed_stream.raw_stream(),
+                                        output_grad_accessor_gpu,
+                                        input_grad_accessor_gpu,
+                                        attrs);
+
+      // Run CPU Reverse Backward Kernel
+      GenericTensorAccessorR output_grad_accessor_cpu =
+          copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator);
+      GenericTensorAccessorW input_grad_accessor_cpu =
+          create_zero_filled_accessor_w(input_shape, cpu_allocator);
+
+      Kernels::Reverse::cpu_backward_kernel(
+          output_grad_accessor_cpu, input_grad_accessor_cpu, attrs);
+
+      CHECK(accessors_are_equal(input_grad_accessor_gpu,
+                                input_grad_accessor_cpu));
+    }
+  }
 }
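The replicate and reverse tests above also introduce a cross-checking pattern: every CUDA kernel that has a cpu_* counterpart is replayed on identical inputs through the CPU kernel, and the two results are compared with accessors_are_equal. A condensed sketch of that pattern, assuming the surrounding fixture from the tests above (input_shape, output_shape, gpu_allocator, cpu_allocator, managed_stream), with Kernels::SomeOp standing in for whichever kernel is under test:

    // GPU side: random input, zero-filled output, kernel under test.
    GenericTensorAccessorR input_gpu =
        create_random_filled_accessor_r(input_shape, gpu_allocator);
    GenericTensorAccessorW output_gpu =
        create_zero_filled_accessor_w(output_shape, gpu_allocator);
    // Kernels::SomeOp::forward_kernel(managed_stream.raw_stream(),
    //                                 input_gpu, output_gpu, attrs);

    // CPU side: copy the *same* input into host memory so both paths see
    // identical data, then run the CPU reference kernel.
    GenericTensorAccessorR input_cpu =
        copy_tensor_accessor_r(input_gpu, cpu_allocator);
    GenericTensorAccessorW output_cpu =
        create_zero_filled_accessor_w(output_shape, cpu_allocator);
    // Kernels::SomeOp::cpu_forward_kernel(input_cpu, output_cpu, attrs);

    // The GPU result must match the CPU reference elementwise.
    CHECK(accessors_are_equal(output_gpu, output_cpu));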
diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc
index c9eaa76b86..904cca2d3e 100644
--- a/lib/kernels/test/src/test_softmax_kernel.cc
+++ b/lib/kernels/test/src/test_softmax_kernel.cc
@@ -1,10 +1,10 @@
-#include "doctest/doctest.h"
+#include "internal/test_utils.h"
 #include "kernels/softmax_kernels.h"
-#include "test_utils.h"
+#include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
 
-TEST_SUITE(FF_TEST_SUITE) {
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
   TEST_CASE("Test Softmax Kernel Operations") {
     nonnegative_int input_n = 1_n;
     nonnegative_int input_c = 1_n;
@@ -12,12 +12,17 @@ TEST_SUITE(FF_TEST_SUITE) {
     nonnegative_int input_w = 100_n;
     nonnegative_int channels = 100_n;
 
-    ManagedPerDeviceFFHandle managed_handle{};
+    ManagedPerDeviceFFHandle managed_handle{
+        /*workSpaceSize=*/1024 * 1024,
+        /*allowTensorOpMathConversion=*/true};
     ManagedFFStream managed_stream{};
 
     Allocator allocator = create_local_cuda_memory_allocator();
 
-    TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n});
+    TensorShape input_shape = TensorShape{
+        TensorDims{FFOrdered{100_n}},
+        DataType::FLOAT,
+    };
     TensorShape output_shape = input_shape;
 
     SoftmaxPerDeviceState state =
@@ -40,30 +45,22 @@ TEST_SUITE(FF_TEST_SUITE) {
                                        input_accessor.get_float_ptr(),
                                        output_accessor.get_float_ptr());
 
-      std::vector<float> host_output_data =
-          load_data_to_host_from_device<float>(
-              read_only_accessor_from_write_accessor(output_accessor));
-      CHECK(contains_non_zero(host_output_data));
+      CHECK(contains_non_zero(output_accessor));
     }
 
     SUBCASE("backward_kernel") {
-      GenericTensorAccessorW output_grad_accessor =
-          create_filled_accessor_w(output_shape, allocator, 1.0f);
+      GenericTensorAccessorR output_grad_accessor =
+          create_random_filled_accessor_r(output_shape, allocator);
       GenericTensorAccessorW input_grad_accessor =
          allocator.allocate_tensor(input_shape);
 
       Kernels::Softmax::backward_kernel(
           managed_stream.raw_stream(),
-          input_grad_accessor.get_float_ptr(),
           output_grad_accessor.get_float_ptr(),
+          input_grad_accessor.get_float_ptr(),
           output_grad_accessor.shape.num_elements().unwrap_nonnegative());
 
-      std::vector<float> expected_input_grad_data = std::vector<float>(
-          input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f);
-      std::vector<float> host_input_grad_data =
-          load_data_to_host_from_device<float>(
-
read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(host_input_grad_data == expected_input_grad_data); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..44e8f42f76 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,24 +1,33 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/split_kernels.h" -#include "test_utils.h" +#include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{100_n}}, + DataType::FLOAT, + }; + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{50_n}}, + DataType::FLOAT, + }; SUBCASE("forward_kernel") { GenericTensorAccessorW input_accessor = @@ -47,8 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_ptrs[i] = output_grad_accessor.get_float_ptr(); } - GenericTensorAccessorW input_grad_accessor = - create_filled_accessor_w(input_shape, allocator, 0.0f); + GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( + input_shape, allocator, make_float_data_type_value(0)); Kernels::Split::backward_kernel(managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..3c15661396 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,58 +1,54 @@ -#include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/transpose_kernels.h" -#include "test_utils.h" +#include using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { TransposeAttrs attrs = TransposeAttrs{ - FFOrdered{ - ff_dim_t{0_n}, + FFOrdered{ ff_dim_t{1_n}, + ff_dim_t{0_n}, }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle{ + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10_n, 10_n}); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{10_n, 10_n}}, + DataType::FLOAT, + }; TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(input_shape, allocator)); + create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( managed_stream.raw_stream(), attrs, 
input_accessor, output_accessor); - std::vector host_output_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(output_accessor)); - CHECK(contains_non_zero(host_output_data)); + CHECK(contains_non_zero(output_accessor)); } SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad_accessor = - read_only_accessor_from_write_accessor( - create_random_filled_accessor_w(output_shape, allocator)); + create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), attrs, - input_grad_accessor, - output_grad_accessor); + output_grad_accessor, + input_grad_accessor); - std::vector host_grad_input_data = - load_data_to_host_from_device( - read_only_accessor_from_write_accessor(input_grad_accessor)); - CHECK(contains_non_zero(host_grad_input_data)); + CHECK(contains_non_zero(input_grad_accessor)); } } } diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc deleted file mode 100644 index 903b666fa9..0000000000 --- a/lib/kernels/test/src/test_utils.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "test_utils.h" - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); - - for (auto &val : host_data) { - val = dist(gen); - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill) { - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume); - - for (size_t i = 0; i < volume; i++) { - host_data[i] = i; - } - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - cudaMemcpyHostToDevice)); - } - - return accessor; -} - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill) { - size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); - std::vector host_data(volume, val); - - if (cpu_fill) { - memcpy(accessor.ptr, host_data.data(), host_data.size() * sizeof(float)); - } else { - checkCUDA(cudaMemcpy(accessor.ptr, - host_data.data(), - host_data.size() * sizeof(float), - 
cudaMemcpyHostToDevice)); - } -} - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::FLOAT, - }; -} - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims) { - return TensorShape{ - TensorDims{ - dims, - }, - DataType::DOUBLE, - }; -} diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h deleted file mode 100644 index 08f0f382fb..0000000000 --- a/lib/kernels/test/src/test_utils.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef _FLEXFLOW_KERNELS_TEST_UTILS -#define _FLEXFLOW_KERNELS_TEST_UTILS - -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include -#include -#include -#include -#include - -using namespace FlexFlow; - -GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - float val, - bool cpu_fill = false); - -GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, - Allocator &allocator, - bool cpu_fill = false); - -void fill_tensor_accessor_w(GenericTensorAccessorW accessor, - float val, - bool cpu_fill = false); - -TensorShape - make_float_tensor_shape_from_legion_dims(FFOrdered dims); - -TensorShape - make_double_tensor_shape_from_legion_dims(FFOrdered dims); - -template -std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { - int volume = accessor.shape.get_volume(); - - std::vector local_data(volume); - checkCUDA(cudaMemcpy(local_data.data(), - accessor.ptr, - local_data.size() * sizeof(T), - cudaMemcpyDeviceToHost)); - return local_data; -} - -template -bool contains_non_zero(std::vector &data) { - return !all_of( - data.begin(), data.end(), [](T const &val) { return val == 0; }); -} - -// Specialize doctest's StringMaker for std::vector -template <> -struct doctest::StringMaker> { - static doctest::String convert(std::vector const &vec) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << ", "; - } - } - return doctest::String(("[" + oss.str() + "]").c_str()); - } -}; - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/local-execution/include/local-execution/per_device_op_state.h index 1edd5b6360..f1f357a86e 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/local-execution/include/local-execution/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#include "kernels/per_device_op_state.dtg.h" #include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 54c8dfc5f1..48584588e3 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#include "kernels/per_device_op_state.dtg.h" #include 
"local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 54eca7e514..5d099c6b46 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_grad_pair)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, IsGrad is_grad) const { SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 1df6da8d8e..5cf8742918 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -134,9 +134,9 @@ static std::optional profiling, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - output_grad.get_float_ptr(), output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), input_grad.get_float_ptr(), scale.get_float_ptr(), scale_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..e9adf88422 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -54,9 +54,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { profiling, "[Cast] forward_time = {:.2lf}ms\n", input, - output, - input.data_type, - attrs.dtype); + output); } static std::optional @@ -73,9 +71,7 @@ static std::optional profiling, "[Cast] forward_time = {:.2lf}ms\n", input_grad, - output_grad, - input.data_type, - attrs.dtype); + output_grad); } TaskImplFunction 
get_cast_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index bb1504a3f5..55ff354483 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -107,8 +107,8 @@ static std::optional acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); auto filter = acc.get_tensor(FILTER); auto input_grad = acc.get_tensor_grad(INPUT); @@ -120,10 +120,10 @@ static std::optional profiling, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), filter.get_float_ptr(), filter_grad.get_float_ptr(), bias_grad.get_float_ptr(), diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index c5ff9199f3..311b8e7924 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -58,8 +58,10 @@ static DeviceSpecificDeviceStates ParallelTensorShape output_shape = throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = init_kernel( - get_piece_shape(input_shape), get_piece_shape(output_shape), attrs); + ElementUnaryPerDeviceState per_device_state = + init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), + array_shape_from_tensor_shape(get_piece_shape(output_shape)), + attrs); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -88,10 +90,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor_grad(INPUT); auto const &attrs = acc.get_argument(ATTRS); auto handle = acc.get_argument(HANDLE); @@ -106,10 +108,10 @@ static std::optional per_device_state, attrs, handle, - input, - input_grad, output, - output_grad); + output_grad, + input, + input_grad); } TaskImplFunction get_element_unary_init_task_impl() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 0f872b5d50..af6fc16272 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -40,15 +40,15 @@ static std::optional ProfilingSettings profiling = acc.get_argument(PROFILING); auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); + auto input_grad = acc.get_tensor_grad(INPUT); return profile(backward_kernel, profiling, "[Flat] backward_time = {:.2lf}ms\n", input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_flat_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 6f0901e66a..9641cdbd4a 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -26,9 +26,9 @@ OpTaskInvocation init(LinearAttrs const &attrs) { binding.bind_arg(HANDLE, ff_handle()); binding.bind_arg(ATTRS, attrs); - 
binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); return {task_id_t::LINEAR_INIT_TASK_ID, binding}; } @@ -36,11 +36,11 @@ OpTaskInvocation init(LinearAttrs const &attrs) { OpTaskInvocation forward(LinearAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); // input - binding.bind(WEIGHT, weight_tensor(0)); // weight - binding.bind(OUTPUT, output_tensor(0)); // output + binding.bind(INPUT, input_tensor(0)); + binding.bind(WEIGHT, weight_tensor(0)); + binding.bind(OUTPUT, output_tensor(0)); if (attrs.use_bias) { - binding.bind(BIAS, weight_tensor(1)); // bias + binding.bind(BIAS, weight_tensor(1)); } binding.bind_arg(PROFILING, profiling_settings()); @@ -124,20 +124,21 @@ static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); + auto output = acc.get_tensor(OUTPUT); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); + auto output_grad = acc.get_tensor_grad(OUTPUT); + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - float const *bias_ptr = NULL; + float *bias_grad_ptr = NULL; if (attrs.use_bias) { - bias_ptr = bias.get_float_ptr(); + auto bias_grad = acc.get_tensor_grad(BIAS); + bias_grad_ptr = bias_grad.get_float_ptr(); } nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); @@ -148,13 +149,13 @@ static std::optional profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, + output.get_float_ptr(), + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + bias_grad_ptr, in_dim.unwrap_nonnegative(), out_dim.unwrap_nonnegative(), batch_size.unwrap_nonnegative()); diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index fb0635efba..f85874dc0a 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -115,19 +115,19 @@ static std::optional Pool2DPerDeviceState state = acc.get_argument(PER_DEVICE_STATE); - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto output_grad = acc.get_tensor(OUTPUT); + auto input = acc.get_tensor(INPUT); + auto input_grad = acc.get_tensor(INPUT); return profile(backward_kernel, profiling, "[Pool2D] backward_time = {:.2lf}ms\n", state, - input.get_float_ptr(), - input_grad.get_float_ptr(), output.get_float_ptr(), - output_grad.get_float_ptr()); + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr()); } TaskImplFunction get_pool_2d_init_task_impl() { diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index ee1a7c6c4e..b07d9fe965 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ 
b/lib/local-execution/src/ops/reduction.cc
@@ -63,13 +63,13 @@ static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument(PROFILING);
 
-  auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
+  auto input_grad = acc.get_tensor_grad(INPUT);
 
   return profile(backward_kernel,
                  profiling,
                  "[Reduction] backward_time = {:.2lf}ms\n",
-                 input_grad,
-                 output_grad);
+                 output_grad,
+                 input_grad);
 }
 
 TaskImplFunction get_reduction_fwd_task_impl() {
diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc
index 6c0c813c8d..7b6e9fe2f6 100644
--- a/lib/local-execution/src/ops/repartition.cc
+++ b/lib/local-execution/src/ops/repartition.cc
@@ -85,8 +85,8 @@ static std::optional<float>
   ProfilingSettings profiling = acc.get_argument(PROFILING);
   auto per_device_state = acc.get_argument(PER_DEVICE_STATE);
 
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
+  auto output_grad = acc.get_tensor_grad(OUTPUT);
+  auto input_grad = acc.get_tensor_grad(INPUT);
 
   return profile(backward_kernel,
                  profiling,
diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc
index d3ada35d93..99aeb913ba 100644
--- a/lib/local-execution/src/ops/replicate.cc
+++ b/lib/local-execution/src/ops/replicate.cc
@@ -66,8 +66,8 @@ static std::optional<float>
   return profile(backward_kernel,
                  profiling,
                  "[replicate] backward_time = {:.2lf}ms\n",
-                 input_grad,
                  output_grad,
+                 input_grad,
                  attrs.replicate_degree.unwrap_nonnegative());
 }
 
diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc
index fc3a75607d..e382b2668e 100644
--- a/lib/local-execution/src/ops/reshape.cc
+++ b/lib/local-execution/src/ops/reshape.cc
@@ -86,8 +86,8 @@ static std::optional<float>
                  profiling,
                  "[Reshape] backward time = {:.2lf}ms\n",
                  per_device_state,
-                 input_grad,
-                 output_grad);
+                 output_grad,
+                 input_grad);
 }
 
 TaskImplFunction get_reshape_init_task_impl() {
diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc
index ddd47d355d..00f56c6892 100644
--- a/lib/local-execution/src/ops/reverse.cc
+++ b/lib/local-execution/src/ops/reverse.cc
@@ -48,30 +48,12 @@ static std::optional<float>
     forward_task_impl(TaskArgumentAccessor const &acc) {
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);
 
-  nonnegative_int output_size = output.shape.get_volume();
-  auto axis = attrs.axis;
-  nonnegative_int in_blk_size = 1_n;
-  nonnegative_int reverse_dim_size = 1_n;
-  nonnegative_int num_out_blks = 1_n;
-  for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) {
-    if (i < axis.value) {
-      in_blk_size *= output.shape.at(ff_dim_t{i});
-    } else if (i == axis.value) {
-      reverse_dim_size = output.shape.at(ff_dim_t{i});
-    } else {
-      num_out_blks *= output.shape.at(ff_dim_t{i});
-    }
-  }
-
   return profile(forward_kernel,
                  profiling,
                  "[reverse] forward_time = {:.2lf}ms\n",
-                 input.get_float_ptr(),
-                 output.get_float_ptr(),
-                 num_out_blks.unwrap_nonnegative(),
-                 reverse_dim_size.unwrap_nonnegative(),
-                 in_blk_size.unwrap_nonnegative(),
-                 output_size.unwrap_nonnegative());
+                 input,
+                 output,
+                 attrs);
 }
 
 static std::optional<float>
@@ -81,30 +63,12 @@ static std::optional<float>
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument(ATTRS);
 
-  int axis = input_grad.shape.num_dims().unwrap_nonnegative() -
-             attrs.axis.value.unwrap_nonnegative() - 1;
-  nonnegative_int in_blk_size = 1_n;
-
nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - return profile(backward_kernel, profiling, "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); + output_grad, + input_grad, + attrs); } TaskImplFunction get_reverse_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 0e94422c5f..e008098e05 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -106,8 +106,8 @@ static std::optional return profile(backward_kernel, profiling, "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), output_grad.get_float_ptr(), + input_grad.get_float_ptr(), output_grad.shape.get_volume().unwrap_nonnegative()); } diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc index 4146836b9a..1859bb0ccc 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -67,8 +67,8 @@ static std::optional profiling, "[Transpose] Backward_time = {:.2lf} [ms]", attrs, - input_grad, - output_grad); + output_grad, + input_grad); } OpTaskInvocation backward(TransposeAttrs const &attrs) { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..9f8b4092c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,68 +12,71 @@ // TEST_SUITE(FF_CUDA_TEST_SUITE) { // TEST_CASE("Local Cost Estimator") { // // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +// ManagedPerDeviceFFHandle managed_handle{ +// /*workSpaceSize=*/1024 * 1024, +// /*allowTensorOpMathConversion=*/true}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; +// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ +// 
DeviceSpecific::create(managed_handle.raw_handle()), +// EnableProfiling::YES, +// ProfilingSettings{/*warmup_iters=*/0, +// /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; +// LocalCostEstimator cost_estimator = +// LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; +// SUBCASE("Estimate cost -- Attention Op") { +// int embed_dim = 32; +// int num_heads = 10; +// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ +// /*embed_dim=*/embed_dim, +// /*num_heads=*/num_heads, +// /*kdim=*/embed_dim, +// /*vdim=*/embed_dim, +// /*dropout=*/0.0, +// /*bias=*/true, +// /*add_bias_kv=*/false, +// /*add_zero_attn=*/false, +// }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; +// size_t batch_size = 40; +// size_t seq_len = 48; +// size_t feature_size = 36; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); +// DataType dtype = DataType::FLOAT; +// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ +// TensorDims{FFOrdered{batch_size, seq_len, +// feature_size}}, DataType::FLOAT, +// }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape weights_shape = throw_if_unexpected( +// get_weights_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs weight_attrs = +// ParallelTensorAttrs{weights_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; +// ParallelTensorShape output_shape = throw_if_unexpected( +// get_output_shape(attrs, inputs_shape, inputs_shape, +// inputs_shape)); +// ParallelTensorAttrs output_attrs = +// ParallelTensorAttrs{output_shape, +// /*sync_type=*/std::nullopt, +// /*initializer=*/std::nullopt, +// CreateGrad::YES}; -// CostDetails result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); +// CostDetails result = cost_estimator.estimate_cost( +// PCGOperatorAttrs{attrs}, +// std::vector{ +// inputs_shape, inputs_shape, inputs_shape}, +// std::vector{weight_attrs}, +// std::vector{output_attrs}, +// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); +// CHECK(result.total_elapsed_time > 0); +// CHECK(result.total_mem_usage > 0); +// } +// } // } -// } -// } diff --git 
a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index dffb19398c..e55d1eddf5 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,6 +1,6 @@ #include "kernels/attention_kernels.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" @@ -106,24 +106,24 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(query_shape), dtype}; CHECK(result == correct); } SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(key_shape), dtype}; CHECK(result == correct); } SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{value_shape}, - dtype}; + std::pair correct = { + array_shape_from_tensor_shape(value_shape), dtype}; CHECK(result == correct); } } @@ -135,9 +135,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } @@ -146,9 +146,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_result_shape_and_dtype_for_tensor_guid_and_map( output_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { - ArrayShape{ + array_shape_from_tensor_shape( get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, + .shape), dtype}; CHECK(result == correct); } diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 0fab0f6a60..a39bb229e2 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,5 +1,5 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "utils/fmt/variant.h" diff --git a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml index 27aa50f38f..09ee99915d 100644 --- a/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml +++ b/lib/op-attrs/include/op-attrs/aggregate_op.enum.toml @@ -10,5 +10,6 @@ features = [ [[values]] name = "SUM" -[[value]] +[[values]] name = "AVG" + diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h new file mode 100644 index 0000000000..723e69bddd --- /dev/null +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#define 
_FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H
+
+#include "op-attrs/datatype_value.dtg.h"
+
+namespace FlexFlow {
+
+DataTypeValue make_float_data_type_value(float value);
+DataTypeValue make_double_data_type_value(double value);
+DataTypeValue make_int32_data_type_value(int32_t value);
+DataTypeValue make_int64_data_type_value(int64_t value);
+DataTypeValue make_bool_data_type_value(bool value);
+
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
index f2355289dc..5c47745209 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
+++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h
@@ -17,13 +17,9 @@ struct DimOrdered {
   DimOrdered(std::initializer_list<T> const &l)
       : contents(l.begin(), l.end()) {}
 
-  /* template ::value>::type> */
   DimOrdered(std::vector<T> const &contents)
       : contents(contents.begin(), contents.end()) {}
 
-  /* template ::value>::type> */
   template <typename It>
   DimOrdered(It begin, It end) : contents(begin, end) {}
 
@@ -62,10 +58,6 @@ struct DimOrdered {
     return this->contents != other.contents;
   }
 
-  bool operator<(DimOrdered const &other) const {
-    return this->contents < other.contents;
-  }
-
   using iterator = typename stack_vector<T, MAX_TENSOR_DIM>::iterator;
   using const_iterator =
       typename stack_vector<T, MAX_TENSOR_DIM>::const_iterator;
@@ -116,7 +108,7 @@ struct DimOrdered {
   }
 
   reverse_iterator rend() {
-    return this->contents.crend();
+    return this->contents.rend();
   }
 
   const_reverse_iterator rend() const {
@@ -145,195 +137,26 @@ struct DimOrdered {
   stack_vector<T, MAX_TENSOR_DIM> contents;
 };
 
-template <typename T>
-struct DimOrdered<ff_dim_t, T> {
-  DimOrdered() {}
-
-  DimOrdered(std::initializer_list<T> const &l)
-      : contents(l.begin(), l.end()) {}
-
-  DimOrdered(std::vector<T> const &contents)
-      : contents(contents.begin(), contents.end()) {}
-
-  template <typename It>
-  DimOrdered(It begin, It end) : contents(begin, end) {}
-
-  template <size_t MAXSIZE>
-  DimOrdered(stack_vector<T, MAXSIZE> const &contents)
-      : contents(contents.begin(), contents.end()) {}
-
-  T const &at(ff_dim_t idx) const {
-    int raw = idx.value.unwrap_nonnegative();
-    return this->contents.at(raw);
-  }
-
-  T const &at(relative_ff_dim_t idx) const {
-    int raw = idx.value;
-    if (raw < 0) {
-      raw = this->contents.size() + raw;
-    }
-    return this->contents.at(raw);
-  }
-
-  T &at(ff_dim_t idx) {
-    int raw = idx.value.unwrap_nonnegative();
-    return this->contents.at(raw);
-  }
-
-  T &at(relative_ff_dim_t idx) {
-    int raw = idx.value;
-    if (raw < 0) {
-      raw = this->contents.size() + raw;
-    }
-    return this->contents.at(raw);
-  }
-
-  T const &operator[](ff_dim_t idx) const {
-    return this->at(idx);
-  }
-
-  T const &operator[](relative_ff_dim_t idx) const {
-    return this->at(idx);
-  }
-
-  T &operator[](ff_dim_t idx) {
-    return this->at(idx);
-  }
-
-  T &operator[](relative_ff_dim_t idx) {
-    return this->at(idx);
-  }
-
-  bool idx_is_valid(ff_dim_t const &idx) const {
-    int raw = idx.value.unwrap_nonnegative();
-    return raw < this->contents.size();
-  }
-
-  bool idx_is_valid(relative_ff_dim_t const &idx) const {
-    int raw = idx.value;
-    if (raw < 0) {
-      raw = this->contents.size() + raw;
-    }
-    return (raw >= 0 && raw < this->contents.size());
-  }
-
-  bool operator==(DimOrdered const &other) const {
-    return this->contents == other.contents;
-  }
-
-  bool operator!=(DimOrdered const &other) const {
-    return this->contents != other.contents;
-  }
-
-  bool operator<(DimOrdered const &other) const {
-    return this->contents < other.contents;
-  }
-
-  using iterator = typename stack_vector<T, MAX_TENSOR_DIM>::iterator;
-  using const_iterator =
-      typename stack_vector<T, MAX_TENSOR_DIM>::const_iterator;
-  using reverse_iterator =
-      typename stack_vector<T, MAX_TENSOR_DIM>::reverse_iterator;
-  using const_reverse_iterator =
-      typename stack_vector<T, MAX_TENSOR_DIM>::const_reverse_iterator;
-  using value_type = T;
-  using pointer = value_type *;
-  using const_pointer = value_type const *;
-  using reference = value_type &;
-  using const_reference = value_type const &;
-
-  iterator begin() {
-    return this->contents.begin();
-  }
-
-  const_iterator begin() const {
-    return this->cbegin();
-  }
-
-  const_iterator cbegin() const {
-    return this->contents.cbegin();
-  }
-
-  iterator end() {
-    return this->contents.end();
-  }
-
-  const_iterator end() const {
-    return this->cend();
-  }
-
-  const_iterator cend() const {
-    return this->contents.cend();
-  }
-
-  reverse_iterator rbegin() {
-    return this->contents.rbegin();
-  }
-
-  const_reverse_iterator rbegin() const {
-    return this->crbegin();
-  }
-
-  const_reverse_iterator crbegin() const {
-    return this->contents.crbegin();
-  }
-
-  reverse_iterator rend() {
-    return this->contents.crend();
-  }
-
-  const_reverse_iterator rend() const {
-    return this->crend();
-  }
-
-  const_reverse_iterator crend() const {
-    return this->contents.crend();
-  }
-
-  size_t size() const {
-    return this->contents.size();
-  }
-
-  size_t empty() const {
-    return this->contents.empty();
-  }
-
-  size_t num_dims() const {
-    return this->size();
-  }
-
-  friend struct ::std::hash<DimOrdered>;
-
-private:
-  stack_vector<T, MAX_TENSOR_DIM> contents;
-};
-
-template <typename T>
-using FFOrdered = DimOrdered<ff_dim_t, T>;
+template <typename Idx, typename T>
+auto operator<(DimOrdered<Idx, T> const &lhs, DimOrdered<Idx, T> const &rhs)
+    -> std::enable_if_t<is_lt_comparable<T>::value, bool> {
+  return std::lexicographical_compare(
+      lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend());
+}
 
-template <typename T>
-std::string format_as(FFOrdered<T> const &v) {
+template <typename Idx, typename T>
+std::string format_as(DimOrdered<Idx, T> const &v) {
   std::vector<T> as_vec(v.cbegin(), v.cend());
   return fmt::format("<{}>", as_vec);
 }
 
-template <typename T>
-std::ostream &operator<<(std::ostream &s, FFOrdered<T> const &v) {
+template <typename Idx, typename T>
+std::ostream &operator<<(std::ostream &s, DimOrdered<Idx, T> const &v) {
   return (s << fmt::to_string(v));
 }
 
 } // namespace FlexFlow
 
-/* template <typename Idx, typename T> */
-/* void to_json(json &j, DimOrdered<Idx, T> const &x) { */
-/*   /1* j = std::vector<T>{x.cbegin(), x.cend()}; *1/ */
-/* } */
-
-/* template <typename Idx, typename T> */
-/* void from_json(json const &j, DimOrdered<Idx, T> &x) { */
-/*   /1* x = DimOrdered<Idx, T>{j.template get<std::vector<T>>()}; *1/ */
-/* } */
-
 namespace nlohmann {
 template <typename Idx, typename T>
 struct adl_serializer<::FlexFlow::DimOrdered<Idx, T>> {
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h
index 166916dd44..76526447be 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h
+++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_SLICE_H
 
 #include "op-attrs/dim_ordered/dim_ordered.h"
-#include "utils/containers/subvec.h"
+#include "utils/containers/slice.h"
 #include "utils/containers/transform.h"
 #include "utils/containers/vector_of.h"
 #include "utils/optional.h"
@@ -18,35 +18,8 @@ DimOrdered<Idx, T> nonoverloaded_slice(DimOrdered<Idx, T> const &d,
   };
 
   return DimOrdered<Idx, T>{
-      subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
+      slice(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
 }
-
-template <typename T>
-FFOrdered<T> ff_dim_t_nonoverloaded_slice(FFOrdered<T> const &d,
-                                          std::optional<ff_dim_t> const &start,
-                                          std::optional<ff_dim_t> const &end) {
-  auto to_raw_idx =
-      [](std::optional<ff_dim_t> const &idx) -> std::optional<int> {
-    return transform(
-        idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); });
-  };
-
-  return FFOrdered<T>{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
-}
-
-template <typename T>
-FFOrdered<T> relative_ff_dim_t_nonoverloaded_slice(
-    FFOrdered<T> const &d,
-    std::optional<relative_ff_dim_t> const &start,
-    std::optional<relative_ff_dim_t> const &end) {
-  auto to_raw_idx =
-      [](std::optional<relative_ff_dim_t> const &idx) -> std::optional<int> {
-    return transform(idx, [](relative_ff_dim_t const &i) { return i.value; });
-  };
-
-  return FFOrdered<T>{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))};
-}
-
 template <typename Idx, typename T>
 DimOrdered<Idx, T> slice(DimOrdered<Idx, T> const &d,
                          std::optional<Idx> const &start = std::nullopt,
@@ -54,20 +27,6 @@ DimOrdered<Idx, T> slice(DimOrdered<Idx, T> const &d,
   return ff_dim_t_nonoverloaded_slice(d, start, end);
 }
 
-template <typename T>
-FFOrdered<T> slice(FFOrdered<T> const &d,
-                   std::optional<ff_dim_t> const &start = std::nullopt,
-                   std::optional<ff_dim_t> const &end = std::nullopt) {
-  return ff_dim_t_nonoverloaded_slice(d, start, end);
-}
-
-template <typename T>
-FFOrdered<T> slice(FFOrdered<T> const &d,
-                   std::optional<relative_ff_dim_t> const &start = std::nullopt,
-                   std::optional<relative_ff_dim_t> const &end = std::nullopt) {
-  return relative_ff_dim_t_nonoverloaded_slice(d, start, end);
-}
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h
similarity index 95%
rename from lib/op-attrs/include/op-attrs/dim_ordered/concat.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/concat.h
index 9b9eaf9b93..a5faed2b36 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/concat.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/concat.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_CONCAT_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 #include "utils/containers/concat_vectors.h"
 #include "utils/containers/transform.h"
 
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h
similarity index 95%
rename from lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h
index 9e4271a1ff..bc8636615c 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/enumerate.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/enumerate.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_ENUMERATE_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 #include "utils/bidict/bidict.h"
 #include "utils/containers/count.h"
 
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h
new file mode 100644
index 0000000000..92ed211c31
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h
@@ -0,0 +1,228 @@
+#ifndef _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H
+#define _FLEXFLOW_OPATTRS_INCLUDE_OPATTRS_DIM_ORDERED_FF_ORDERED_H
+
+#include "op-attrs/ff_dim_t.dtg.h"
+#include "op-attrs/relative_ff_dim_t.dtg.h"
+#include "utils/fmt/vector.h"
+#include "utils/stack_vector/stack_vector.h"
+
+namespace FlexFlow {
+
+template <typename T>
+struct FFOrdered {
+  FFOrdered() {}
+
+  FFOrdered(std::initializer_list<T> const &l) : contents(l.begin(), l.end()) {}
+
+  FFOrdered(std::vector<T> const &contents)
+      : contents(contents.begin(), contents.end()) {}
+
+  template <typename It>
+  FFOrdered(It begin, It end) : contents(begin, end) {}
+
+  template <size_t MAXSIZE>
+  FFOrdered(stack_vector<T, MAXSIZE> const &contents)
+      : contents(contents.begin(), contents.end()) {}
+
+  T const &at(ff_dim_t idx) const {
+    int raw = idx.value.unwrap_nonnegative();
+    return this->contents.at(raw);
+  }
+
+  T const &at(relative_ff_dim_t idx) const {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return this->contents.at(raw);
+  }
+
+  T &at(ff_dim_t idx) {
+    int raw = idx.value.unwrap_nonnegative();
+    return this->contents.at(raw);
+  }
+
+  T &at(relative_ff_dim_t idx) {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return this->contents.at(raw);
+  }
+
+  T const &operator[](ff_dim_t idx) const {
+    return this->at(idx);
+  }
+
+  T const &operator[](relative_ff_dim_t idx) const {
+    return this->at(idx);
+  }
+
+  T &operator[](ff_dim_t idx) {
+    return this->at(idx);
+  }
+
+  T &operator[](relative_ff_dim_t idx) {
+    return this->at(idx);
+  }
+
+  bool idx_is_valid(ff_dim_t const &idx) const {
+    int raw = idx.value.unwrap_nonnegative();
+    return raw < this->contents.size();
+  }
+
+  bool idx_is_valid(relative_ff_dim_t const &idx) const {
+    int raw = idx.value;
+    if (raw < 0) {
+      raw = this->contents.size() + raw;
+    }
+    return (raw >= 0 && raw < this->contents.size());
+  }
+
+  bool operator==(FFOrdered const &other) const {
+    return this->contents == other.contents;
+  }
+
+  bool operator!=(FFOrdered const &other) const {
+    return this->contents != other.contents;
+  }
+
+  using iterator = typename stack_vector<T, MAX_TENSOR_DIM>::iterator;
+  using const_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::const_iterator;
+  using reverse_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::reverse_iterator;
+  using const_reverse_iterator =
+      typename stack_vector<T, MAX_TENSOR_DIM>::const_reverse_iterator;
+  using value_type = T;
+  using pointer = value_type *;
+  using const_pointer = value_type const *;
+  using reference = value_type &;
+  using const_reference = value_type const &;
+
+  iterator begin() {
+    return this->contents.begin();
+  }
+
+  const_iterator begin() const {
+    return this->cbegin();
+  }
+
+  const_iterator cbegin() const {
+    return this->contents.cbegin();
+  }
+
+  iterator end() {
+    return this->contents.end();
+  }
+
+  const_iterator end() const {
+    return this->cend();
+  }
+
+  const_iterator cend() const {
+    return this->contents.cend();
+  }
+
+  reverse_iterator rbegin() {
+    return this->contents.rbegin();
+  }
+
+  const_reverse_iterator rbegin() const {
+    return this->crbegin();
+  }
+
+  const_reverse_iterator crbegin() const {
+    return this->contents.crbegin();
+  }
+
+  reverse_iterator rend() {
+    return this->contents.rend();
+  }
+
+  const_reverse_iterator rend() const {
+    return this->crend();
+  }
+
+  const_reverse_iterator crend() const {
+    return this->contents.crend();
+  }
+
+  size_t size() const {
+    return this->contents.size();
+  }
+
+  size_t empty() const {
+    return this->contents.empty();
+  }
+
+  size_t num_dims() const {
+    return this->size();
+  }
+
+  friend struct ::std::hash<FFOrdered<T>>;
+
+private:
+  stack_vector<T, MAX_TENSOR_DIM> contents;
+};
+
+template <typename T>
+auto operator<(FFOrdered<T> const &lhs, FFOrdered<T> const &rhs)
+    -> std::enable_if_t<is_lt_comparable<T>::value, bool> {
+  return std::lexicographical_compare(
+      lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend());
+}
+
+template <typename T>
+std::string format_as(FFOrdered<T> const &v) {
+  std::vector<T> as_vec(v.cbegin(), v.cend());
+  return fmt::format("<{}>", as_vec);
+}
+
+template <typename T>
+std::ostream &operator<<(std::ostream &s, FFOrdered<T> const &v) {
+  return (s << fmt::to_string(v));
+}
+
+} // namespace FlexFlow
+
+namespace nlohmann {
+template <typename T>
+struct adl_serializer<::FlexFlow::FFOrdered<T>> {
+  static ::FlexFlow::FFOrdered<T> from_json(nlohmann::json const &j) {
+    return {j.template get<std::vector<T>>()};
+  }
+
+  static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered<T> const &x) {
+    j = std::vector<T>{x.cbegin(), x.cend()};
+  }
+};
+} // namespace nlohmann
+
+namespace std {
+
+template <typename T>
+struct hash<::FlexFlow::FFOrdered<T>> {
+  size_t operator()(::FlexFlow::FFOrdered<T> const &t) const {
+    static_assert(::FlexFlow::is_hashable<T>::value,
+                  "Elements must be hashable");
+
+    return get_std_hash(t.contents);
+  }
+};
+
+} // namespace std
+
+namespace rc {
+
+template <typename T>
+struct Arbitrary<::FlexFlow::FFOrdered<T>> {
+  static Gen<::FlexFlow::FFOrdered<T>> arbitrary() {
+    return gen::construct<::FlexFlow::FFOrdered<T>>(
+        gen::arbitrary<::FlexFlow::stack_vector<T, MAX_TENSOR_DIM>>());
+  }
+};
+
+} // namespace rc
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h
similarity index 88%
rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h
index f8f49233ec..9232afddfb 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_from_map.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_from_map.h
@@ -1,9 +1,9 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_FROM_MAP_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
-#include "op-attrs/dim_ordered/ff_ordered_of.h"
 #include "op-attrs/ff_dim_t.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered_of.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h
similarity index 88%
rename from lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h
index 8cc1bf3a51..ace60b7e3d 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/ff_ordered_of.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered_of.h
@@ -1,7 +1,7 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_FF_ORDERED_OF_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h
similarity index 91%
rename from lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h
rename to lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h
index 4e7f8530a4..5ff390d3fe 100644
--- a/lib/op-attrs/include/op-attrs/dim_ordered/get_idxs.h
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/get_idxs.h
@@ -1,8 +1,8 @@
 #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H
 #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_GET_IDXS_H
 
-#include "op-attrs/dim_ordered/dim_ordered.h"
 #include "op-attrs/ff_dim_t.h"
+#include "op-attrs/ff_ordered/ff_ordered.h"
 #include "utils/containers/count.h"
 #include "utils/containers/transform.h"
 
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h
new file mode 100644
index 0000000000..79217c4cc3
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h
@@ -0,0 +1,49 @@
+#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H
+#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H
+
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "utils/containers/slice.h"
+#include "utils/containers/transform.h"
+#include "utils/containers/vector_of.h"
+
+namespace FlexFlow {
+
+template <typename T>
+FFOrdered<T> ff_dim_t_nonoverloaded_slice(FFOrdered<T> const &d,
+                                          ff_dim_t const &start,
+                                          std::optional<ff_dim_t> const &end) {
+  int raw_start = start.value.unwrap_nonnegative();
+  std::optional<int> raw_end = transform(
+      end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); });
+  return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)};
+}
+
+template <typename T>
+FFOrdered<T> relative_ff_dim_t_nonoverloaded_slice(
+    FFOrdered<T> const &d,
+    relative_ff_dim_t const &start,
+    std::optional<relative_ff_dim_t> const &end) {
+  int raw_start = start.value;
+  std::optional<int> raw_end =
+      transform(end, [](relative_ff_dim_t const &i) { return i.value; });
+
+  return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)};
+}
+
+template <typename T>
+FFOrdered<T> slice(FFOrdered<T> const &d,
+                   ff_dim_t const &start = ff_dim_t{0_n},
+                   std::optional<ff_dim_t> const &end = std::nullopt) {
+  return ff_dim_t_nonoverloaded_slice(d, start, end);
+}
+
+template <typename T>
+FFOrdered<T> slice(FFOrdered<T> const &d,
+                   relative_ff_dim_t const &start = relative_ff_dim_t{0},
+                   std::optional<relative_ff_dim_t> const &end = std::nullopt) {
+  return relative_ff_dim_t_nonoverloaded_slice(d, start, end);
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h
new file mode 100644
index 0000000000..3a8eeb9ecf
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h
@@ -0,0 +1,17 @@
+#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H
+#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H
+
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "utils/containers/vector_of.h"
+#include "utils/containers/vector_transform.h"
+
+namespace FlexFlow {
+
+template <typename T, typename F, typename Out = std::invoke_result_t<F, T>>
+FFOrdered<Out> transform(FFOrdered<T> const &d, F &&f) {
+  return FFOrdered<Out>{vector_transform(vector_of(d), f)};
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h
new file mode 100644
index 0000000000..fe207740f7
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h
@@ -0,0 +1,18 @@
+#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H
+#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H
+
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "utils/containers/vector_of.h"
+#include "utils/containers/zip.h"
+
+namespace FlexFlow {
+
+template <typename T1, typename T2>
+FFOrdered<std::pair<T1, T2>> zip(FFOrdered<T1> const &lhs,
+                                 FFOrdered<T2> const &rhs) {
+  return FFOrdered<std::pair<T1, T2>>{zip(vector_of(lhs), vector_of(rhs))};
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml
index b1c5f60382..50756f095b 100644
--- a/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/transpose_attrs.struct.toml
@@ -12,7 +12,7 @@ features = [
 includes = [
   "op-attrs/ff_dim_t.h",
   "op-attrs/ff_dim_t.dtg.h",
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
 ]
 
 [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
index be3a95eec8..d68ef02ec1 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
@@ -12,7 +12,7 @@ features = [
 includes = [
   "op-attrs/parallel_tensor_shape/sum_degree.dtg.h",
   "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h",
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
   "utils/nonnegative_int/nonnegative_int.h",
 ]
 
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml
index f24fa12309..d2f8758377 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.struct.toml
@@ -10,7 +10,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
   "op-attrs/shard_parallel_dim.dtg.h",
   "op-attrs/replica_parallel_dim_set.dtg.h",
   "",
 ]
 
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h
index 97f3432c2f..ba35295e09 100644
--- a/lib/op-attrs/include/op-attrs/tensor_dims.h
+++ b/lib/op-attrs/include/op-attrs/tensor_dims.h
@@ -19,7 +19,7 @@ std::optional<TensorDims>
     get_broadcast_target_dims(std::unordered_set<TensorDims> const &);
 
 TensorDims slice_tensor_dims(TensorDims const &,
-                             std::optional<relative_ff_dim_t> const &start,
+                             relative_ff_dim_t const &start,
                              std::optional<relative_ff_dim_t> const &stop);
 
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml
index e86b866fd6..8c6d1098cc 100644
--- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml
+++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml
@@ -10,7 +10,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/dim_ordered/dim_ordered.h",
+  "op-attrs/ff_ordered/ff_ordered.h",
   "utils/nonnegative_int/nonnegative_int.h",
 ]
 
diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h
index a3cd8bfd9a..298ea04638 100644
--- a/lib/op-attrs/include/op-attrs/tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/tensor_shape.h
@@ -12,7 +12,7 @@ nonnegative_int get_num_elements(TensorShape const &);
 nonnegative_int get_size_in_bytes(TensorShape const &);
 
 TensorShape slice_tensor_shape(TensorShape const &,
-                               std::optional<relative_ff_dim_t> const &start,
+                               relative_ff_dim_t const &start,
                                std::optional<relative_ff_dim_t> const &stop);
 
 } // namespace FlexFlow
 
diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc
new file mode 100644
index 0000000000..4604ef0b4e
--- /dev/null
+++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -0,0 +1,25 @@
+#include "op-attrs/datatype_value.h"
+
+namespace FlexFlow {
+
+DataTypeValue make_float_data_type_value(float value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_double_data_type_value(double value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_int32_data_type_value(int32_t value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_int64_data_type_value(int64_t value) {
+  return DataTypeValue{value};
+}
+
+DataTypeValue make_bool_data_type_value(bool value) {
+  return DataTypeValue{value};
+}
+
+} // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc
deleted file mode 100644
index cb29f708a3..0000000000
--- a/lib/op-attrs/src/op-attrs/dim_ordered/concat.cc
+++ /dev/null
@@ -1 +0,0 @@
-#include "op-attrs/dim_ordered/concat.h"
diff --git
a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc deleted file mode 100644 index 6edd5485af..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/enumerate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/enumerate.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc deleted file mode 100644 index 2de88f38c8..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_from_map.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc b/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc deleted file mode 100644 index 8e5c2fd38a..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/ff_ordered_of.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/ff_ordered_of.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc deleted file mode 100644 index 175ae8d4bd..0000000000 --- a/lib/op-attrs/src/op-attrs/dim_ordered/get_idxs.cc +++ /dev/null @@ -1 +0,0 @@ -#include "op-attrs/dim_ordered/get_idxs.h" diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc index 75ab1a32aa..8c3dbd7bbc 100644 --- a/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc +++ b/lib/op-attrs/src/op-attrs/dim_ordered/slice.cc @@ -1,26 +1 @@ #include "op-attrs/dim_ordered/slice.h" -#include "utils/archetypes/value_type.h" - -namespace FlexFlow { - -using T = value_type<0>; - -template FFOrdered - ff_dim_t_nonoverloaded_slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered relative_ff_dim_t_nonoverloaded_slice( - FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -template FFOrdered slice(FFOrdered const &d, - std::optional const &start, - std::optional const &end); - -} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc new file mode 100644 index 0000000000..73683eba94 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/dim_ordered/transform.cc @@ -0,0 +1 @@ +#include "op-attrs/dim_ordered/transform.h" diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc new file mode 100644 index 0000000000..e06c144149 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/enumerate.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc new file mode 100644 index 0000000000..1420586809 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered.cc @@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template struct FFOrdered; + +template std::string format_as(FFOrdered const &); + +template std::ostream &operator<<(std::ostream &, FFOrdered const &); + +} // namespace FlexFlow diff --git 
a/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc new file mode 100644 index 0000000000..e39fedb858 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/ff_ordered_from_map.cc @@ -0,0 +1,13 @@ +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_ordered_from_map(std::map const &); + +template FFOrdered + ff_ordered_from_map(std::unordered_map const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc new file mode 100644 index 0000000000..3da15bebba --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/get_idxs.cc @@ -0,0 +1,10 @@ +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector get_idxs(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc new file mode 100644 index 0000000000..059fd811cd --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/slice.cc @@ -0,0 +1,24 @@ +#include "op-attrs/ff_ordered/slice.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered ff_dim_t_nonoverloaded_slice( + FFOrdered const &, ff_dim_t const &, std::optional const &); + +template FFOrdered relative_ff_dim_t_nonoverloaded_slice( + FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + ff_dim_t const &, + std::optional const &); + +template FFOrdered slice(FFOrdered const &, + relative_ff_dim_t const &, + std::optional const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc new file mode 100644 index 0000000000..74bf4895a3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/transform.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/transform.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; +using Out = value_type<1>; +using F = std::function; + +template FFOrdered transform(FFOrdered const &, F &&); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc new file mode 100644 index 0000000000..dc715ea97c --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/zip.cc @@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/zip.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; + +template FFOrdered> zip(FFOrdered const &, + FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index d4763ef004..ddd92bd417 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/batch_norm.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc 
b/lib/op-attrs/src/op-attrs/ops/concat.cc index fc42241ef2..bf0ba553e4 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/concat.h" -#include "op-attrs/dim_ordered/enumerate.h" -#include "op-attrs/dim_ordered/ff_ordered_from_map.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/ff_ordered_from_map.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 4dc602646b..5b5b91a8e7 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -1,8 +1,10 @@ #include "op-attrs/ops/embedding.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" #include "utils/containers/product.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index 8ed12167b3..b4eeda76ab 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/flat.h" -#include "op-attrs/dim_ordered/concat.h" -#include "op-attrs/dim_ordered/slice.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_dims.h" #include "utils/containers/any_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 00c6bb5e9b..c9798368e2 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -1,6 +1,6 @@ #include "op-attrs/ops/layer_norm.h" -#include "op-attrs/dim_ordered/ff_ordered_of.h" -#include "op-attrs/dim_ordered/get_idxs.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index fb26113613..bee9d0cf4f 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -1,11 +1,12 @@ #include "op-attrs/ops/linear.h" -#include "op-attrs/dim_ordered/slice.h" -#include "op-attrs/dim_ordered/transform.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" +#include "utils/fmt/optional.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -101,7 +102,7 @@ tl::expected SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), - std::nullopt, + relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, @@ -126,8 +127,10 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * 
          shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree};
-  DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice(
-      ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))};
+  DiscardCopyDegree discard_copy_degree =
+      DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input),
+                                      relative_ff_dim_t{0},
+                                      relative_ff_dim_t{-1}))};
 
   FFOrdered<nonnegative_int> shard_degrees =
       FFOrdered<nonnegative_int>{get_discard_copy_degree(input)};
 
diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
index 7a8f91e498..3f2245b2dc 100644
--- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
+++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc
@@ -1,6 +1,6 @@
 #include "op-attrs/parallel_tensor_dims.h"
-#include "op-attrs/dim_ordered/transform.h"
-#include "op-attrs/dim_ordered/zip.h"
+#include "op-attrs/ff_ordered/transform.h"
+#include "op-attrs/ff_ordered/zip.h"
 #include "op-attrs/replica_parallel_dim.h"
 #include "op-attrs/replica_parallel_dim_set.h"
 #include "op-attrs/shard_parallel_dim.h"
diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc
index 8d0592eab7..760278297c 100644
--- a/lib/op-attrs/src/op-attrs/tensor_dims.cc
+++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc
@@ -1,6 +1,6 @@
 #include "op-attrs/tensor_dims.h"
-#include "op-attrs/dim_ordered/slice.h"
-#include "op-attrs/dim_ordered/zip.h"
+#include "op-attrs/ff_ordered/slice.h"
+#include "op-attrs/ff_ordered/zip.h"
 #include "op-attrs/replica_parallel_dim_set.h"
 #include "op-attrs/shard_parallel_dim.dtg.h"
 #include "utils/containers/all_of.h"
@@ -67,7 +67,7 @@ std::optional<TensorDims>
 }
 
 TensorDims slice_tensor_dims(TensorDims const &dims,
-                             std::optional<relative_ff_dim_t> const &start,
+                             relative_ff_dim_t const &start,
                              std::optional<relative_ff_dim_t> const &stop) {
   return TensorDims{
       slice(dims.ff_ordered, start, stop),
diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc
index 04b18794f1..afc14af54c 100644
--- a/lib/op-attrs/src/op-attrs/tensor_shape.cc
+++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc
@@ -29,7 +29,7 @@ nonnegative_int get_size_in_bytes(TensorShape const &s) {
 }
 
 TensorShape slice_tensor_shape(TensorShape const &shape,
-                               std::optional<relative_ff_dim_t> const &start,
+                               relative_ff_dim_t const &start,
                                std::optional<relative_ff_dim_t> const &stop) {
   return TensorShape{
       slice_tensor_dims(shape.dims, start, stop),
diff --git a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
new file mode 100644
index 0000000000..9b0e90b601
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
@@ -0,0 +1,68 @@
+#include "op-attrs/datatype_value.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("test make_data_type_value") {
+    SUBCASE("make_float_data_type_value") {
+      float value = 1.0f;
+      DataTypeValue data_type_value = make_float_data_type_value(value);
+
+      CHECK(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<float>() == value);
+    }
+
+    SUBCASE("make_double_data_type_value") {
+      double value = 2.71828;
+      DataTypeValue data_type_value = make_double_data_type_value(value);
+
+      CHECK(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<double>() == value);
+    }
+
+    SUBCASE("make_int32_data_type_value") {
+      int32_t value = -42;
+      DataTypeValue data_type_value = make_int32_data_type_value(value);
+
+      CHECK(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int32_t>() == value);
+    }
+
+    SUBCASE("make_int64_data_type_value") {
+      int64_t value = 1LL << 40;
+      DataTypeValue data_type_value = make_int64_data_type_value(value);
+
+      CHECK(data_type_value.has<int64_t>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<bool>());
+      CHECK(data_type_value.get<int64_t>() == value);
+    }
+
+    SUBCASE("make_bool_data_type_value") {
+      bool value = true;
+      DataTypeValue data_type_value = make_bool_data_type_value(value);
+
+      CHECK(data_type_value.has<bool>());
+      CHECK_FALSE(data_type_value.has<float>());
+      CHECK_FALSE(data_type_value.has<double>());
+      CHECK_FALSE(data_type_value.has<int32_t>());
+      CHECK_FALSE(data_type_value.has<int64_t>());
+      CHECK(data_type_value.get<bool>() == value);
+    }
+  }
+}
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc
index d7901a0c53..a5a261da25 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc
+++ b/lib/op-attrs/test/src/op-attrs/dim_ordered/dim_ordered.cc
@@ -10,8 +10,4 @@ TEST_SUITE(FF_TEST_SUITE) {
       "Arbitrary<DimOrdered<ff_dim_t, T>> with T=", T, int, double, char) {
     RC_SUBCASE([](DimOrdered<ff_dim_t, T>) {});
   }
-
-  TEST_CASE_TEMPLATE("Arbitrary<FFOrdered<T>> with T=", T, int, double, char) {
-    RC_SUBCASE([](FFOrdered<T>) {});
-  }
 }
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
similarity index 97%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
index 2ac641cfc2..d8e04124bc 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/concat.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/concat.h"
+#include "op-attrs/ff_ordered/concat.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
similarity index 92%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
index bf4c33d65a..e1a94e72c3 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/enumerate.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/enumerate.h"
+#include "op-attrs/ff_ordered/enumerate.h"
 #include "test/utils/doctest/fmt/map.h"
 #include <doctest/doctest.h>
 
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc
new file mode 100644
index 0000000000..b0812ba9d6
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered.cc
@@ -0,0 +1,11 @@
+#include "op-attrs/ff_ordered/ff_ordered.h"
+#include "test/utils/rapidcheck.h"
+#include <doctest/doctest.h>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE_TEMPLATE("Arbitrary<FFOrdered<T>> with T=", T, int, double, char) {
+    RC_SUBCASE([](FFOrdered<T>) {});
+  }
+}
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
similarity index 96%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
index bba989920e..73036d5662 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/ff_ordered_from_map.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/ff_ordered_from_map.h"
+#include "op-attrs/ff_ordered/ff_ordered_from_map.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
diff --git a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc
similarity index 79%
rename from lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc
rename to lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc
index b2fddd058e..2f1dfecd65 100644
--- a/lib/op-attrs/test/src/op-attrs/dim_ordered/slice.cc
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/slice.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/dim_ordered/slice.h"
+#include "op-attrs/ff_ordered/slice.h"
 #include <doctest/doctest.h>
 
 using namespace ::FlexFlow;
@@ -25,13 +25,6 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(result == correct);
     }
 
-    SUBCASE("std::nullopt_t, ff_dim_t") {
-      FFOrdered<int> result =
-          slice(d, std::nullopt, ff_dim_t{nonnegative_int{3}});
-      FFOrdered<int> correct = FFOrdered<int>{1, 2, 3};
-
-      CHECK(result == correct);
-    }
     SUBCASE("relative_ff_dim_t, relative_ff_dim_t") {
       FFOrdered<int> result =
           slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{-1});
@@ -45,12 +38,6 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(result == correct);
     }
 
-    SUBCASE("std::nullopt_t, relative_ff_dim_t") {
-      FFOrdered<int> result = slice(d, std::nullopt, relative_ff_dim_t{-1});
-      FFOrdered<int> correct = FFOrdered<int>{1, 2, 3};
-
-      CHECK(result == correct);
-    }
     SUBCASE("start index = stop index") {
       FFOrdered<int> result =
          slice(d, relative_ff_dim_t{1}, relative_ff_dim_t{1});
@@ -86,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK_THROWS(slice(d, relative_ff_dim_t{10}, std::nullopt));
     }
     SUBCASE("stop index out of bounds (too low)") {
-      CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{-10}));
+      CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{-10}));
     }
     SUBCASE("stop index out of bounds (too high)") {
-      CHECK_THROWS(slice(d, std::nullopt, relative_ff_dim_t{10}));
+      CHECK_THROWS(slice(d, relative_ff_dim_t{0}, relative_ff_dim_t{10}));
    }
   }
 }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc
new file mode 100644
index 0000000000..4bf189ec77
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc
@@ -0,0 +1,35 @@
+#include "op-attrs/ff_ordered/transform.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("transform(FFOrdered, F)") {
+    SUBCASE("input is empty") {
+      FFOrdered<std::string> input = {};
+
+      FFOrdered<int> result = transform(input, [](std::string const &) -> int {
+        CHECK(false);
+        return 0;
+      });
+      FFOrdered<int> correct = {};
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("input is not empty") {
+      FFOrdered<int> input = {2, 1, 2, 5};
+
+      FFOrdered<std::string> result =
+          transform(input, [](int x) { return fmt::to_string(x); });
+      FFOrdered<std::string> correct = FFOrdered<std::string>{
+          "2",
+          "1",
+          "2",
+          "5",
+      };
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc
new file mode 100644
index 0000000000..19167cd0ff
--- /dev/null
+++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc
@@ -0,0 +1,38 @@
+#include "op-attrs/ff_ordered/zip.h"
+#include "test/utils/doctest/fmt/pair.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("zip(FFOrdered, FFOrdered)") {
+    FFOrdered<int> lhs_input = {9, 9, 8, 9};
+    FFOrdered<std::string> rhs_input = {"m", "m", "k", "l", "m"};
+
+    SUBCASE("rhs is longer") {
+      FFOrdered<std::pair<int, std::string>> result = zip(lhs_input, rhs_input);
+
+      FFOrdered<std::pair<int, std::string>> correct = {
+          {9, "m"},
+          {9, "m"},
+          {8, "k"},
+          {9, "l"},
+      };
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("lhs is longer") {
+      FFOrdered<std::pair<std::string, int>> result = zip(rhs_input, lhs_input);
+
+      FFOrdered<std::pair<std::string, int>> correct = {
+          {"m", 9},
+          {"m", 9},
+          {"k", 8},
+          {"l", 9},
+      };
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/pcg/include/pcg/metric.enum.toml b/lib/pcg/include/pcg/metric.enum.toml
new file mode 100644
index 0000000000..ebb2323203
--- /dev/null
+++ b/lib/pcg/include/pcg/metric.enum.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "Metric"
+features = [
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+[[values]]
+name = "ACCURACY"
+
+[[values]]
+name = "CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "SPARSE_CATEGORICAL_CROSSENTROPY"
+
+[[values]]
+name = "MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "ROOT_MEAN_SQUARED_ERROR"
+
+[[values]]
+name = "MEAN_ABSOLUTE_ERROR"
diff --git a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h
new file mode 100644
index 0000000000..343c2154dd
--- /dev/null
+++ b/lib/pcg/include/pcg/metric_attrs.h
@@ -0,0 +1,28 @@
+#ifndef _FF_METRICS_H_
+#define _FF_METRICS_H_
+
+#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "pcg/metric.dtg.h"
+#include "utils/fmt.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+class MetricsAttrs {
+public:
+  MetricsAttrs() = delete;
+  MetricsAttrs(LossFunction, std::unordered_set<Metric> const &);
+
+public:
+  LossFunction loss_type;
+  bool measure_accuracy;
+  bool measure_categorical_crossentropy;
+  bool measure_sparse_categorical_crossentropy;
+  bool measure_mean_squared_error;
+  bool measure_root_mean_squared_error;
+  bool measure_mean_absolute_error;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
index 3542e73dea..f820c56d61 100644
--- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
+++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 #define _FLEXFLOW_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_H
 
+#include "pcg/computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h"
diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc
new file mode 100644
index 0000000000..9a93e75350
--- /dev/null
+++ b/lib/pcg/src/pcg/metric_attrs.cc
@@ -0,0 +1,38 @@
+#include "pcg/metric_attrs.h"
+
+namespace FlexFlow {
+MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
+                           std::unordered_set<Metric> const &metrics)
+    : loss_type(_loss_type), measure_accuracy(false),
+      measure_categorical_crossentropy(false),
+      measure_sparse_categorical_crossentropy(false),
+      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
+      measure_mean_absolute_error(false) {
+  for (Metric const &m : metrics) {
+    switch (m) {
+      case Metric::ACCURACY:
+        measure_accuracy = true;
+        continue;
+      case Metric::CATEGORICAL_CROSSENTROPY:
+        measure_categorical_crossentropy = true;
+        continue;
+      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
+        measure_sparse_categorical_crossentropy = true;
+        continue;
+      case Metric::MEAN_SQUARED_ERROR:
+        measure_mean_squared_error = true;
+        continue;
+      case Metric::ROOT_MEAN_SQUARED_ERROR:
+        measure_root_mean_squared_error = true;
+        continue;
+      case Metric::MEAN_ABSOLUTE_ERROR:
+        measure_mean_absolute_error = true;
+        continue;
+      default:
+        throw mk_runtime_error(fmt::format(
+            "Initializing MetricsAttrs with unrecognized metrics type {}", m));
+    }
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
index 2cf149f78a..940024c9b6 100644
--- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
+++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc
@@ -1,5 +1,5 @@
 #include "pcg/parallel_computation_graph/generate_weight_transform.h"
-#include "op-attrs/dim_ordered/enumerate.h"
+#include "op-attrs/ff_ordered/enumerate.h"
 #include "op-attrs/parallel_tensor_shape.h"
 
 namespace FlexFlow {
diff --git a/lib/runtime/src/metrics_functions.cc b/lib/runtime/src/metrics_functions.cc
index feb6e704b2..33e15baed2 100644
--- a/lib/runtime/src/metrics_functions.cc
+++ b/lib/runtime/src/metrics_functions.cc
@@ -25,39 +25,6 @@ namespace FlexFlow {
 
 LegionRuntime::Logger::Category log_metrics("metrics");
 
-MetricsAttrs::MetricsAttrs(LossFunction _loss_type,
-                           std::vector<Metric> const &metrics)
-    : loss_type(_loss_type), measure_accuracy(false),
-      measure_categorical_crossentropy(false),
-      measure_sparse_categorical_crossentropy(false),
-      measure_mean_squared_error(false), measure_root_mean_squared_error(false),
-      measure_mean_absolute_error(false) {
-  for (Metric const &m : metrics) {
-    switch (m) {
-      case Metric::ACCURACY:
-        measure_accuracy = true;
-        continue;
-      case Metric::CATEGORICAL_CROSSENTROPY:
-        measure_categorical_crossentropy = true;
-        continue;
-      case Metric::SPARSE_CATEGORICAL_CROSSENTROPY:
-        measure_sparse_categorical_crossentropy = true;
-        continue;
-      case Metric::MEAN_SQUARED_ERROR:
-        measure_mean_squared_error = true;
-        continue;
-      case Metric::ROOT_MEAN_SQUARED_ERROR:
-        measure_root_mean_squared_error = true;
-        continue;
-      case Metric::MEAN_ABSOLUTE_ERROR:
-        measure_mean_absolute_error = true;
-        continue;
-      default:
-        throw mk_runtime_error("Unrecogonized metrics type {}", m);
-    }
-  }
-}
-
 enum Slots {
   LOGIT,
   LABEL,
diff --git a/lib/runtime/src/metrics_functions.h b/lib/runtime/src/metrics_functions.h
index fbb0b633bf..73dc3bbc51 100644
--- a/lib/runtime/src/metrics_functions.h
+++ b/lib/runtime/src/metrics_functions.h
@@ -16,38 +16,13 @@
 #ifndef _FF_METRICS_FUNCTIONS_H_
 #define _FF_METRICS_FUNCTIONS_H_
 
+#include "kernels/metric.h"
 #include "kernels/perf_metrics.h"
 #include "legion.h"
-#include "op-attrs/ops/loss_functions.h"
 #include "task_spec/task_invocation.h"
-#include "utils/fmt.h"
 
 namespace FlexFlow {
 
-enum class Metric {
-  ACCURACY,
-  CATEGORICAL_CROSSENTROPY,
-  SPARSE_CATEGORICAL_CROSSENTROPY,
-  MEAN_SQUARED_ERROR,
-  ROOT_MEAN_SQUARED_ERROR,
-  MEAN_ABSOLUTE_ERROR,
-};
-
-class MetricsAttrs {
-public:
-  MetricsAttrs() = delete;
-  MetricsAttrs(LossFunction, std::vector<Metric> const &);
-
-public:
-  LossFunction loss_type;
-  bool measure_accuracy;
-  bool measure_categorical_crossentropy;
-  bool measure_sparse_categorical_crossentropy;
-  bool measure_mean_squared_error;
-  bool measure_root_mean_squared_error;
-  bool measure_mean_absolute_error;
-};
-
 TypedIndexTaskInvocation<PerfMetrics>
compute_metrics(MetricsAttrs const &, parallel_tensor_guid_t const &logit, @@ -79,40 +54,4 @@ VISITABLE_STRUCT(::FlexFlow::MetricsAttrs, measure_root_mean_squared_error, measure_mean_absolute_error); -namespace fmt { - -template <> -struct formatter<::FlexFlow::Metric> : formatter { - template - auto format(::FlexFlow::Metric m, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - switch (m) { - case Metric::ACCURACY: - name = "Accuracy"; - break; - case Metric::CATEGORICAL_CROSSENTROPY: - name = "CategoricalCrossEntropy"; - break; - case Metric::SPARSE_CATEGORICAL_CROSSENTROPY: - name = "SparseCategoricalCrossEntropy"; - break; - case Metric::MEAN_SQUARED_ERROR: - name = "MeanSquaredError"; - break; - case Metric::ROOT_MEAN_SQUARED_ERROR: - name = "RootMeanSquaredError"; - break; - case Metric::MEAN_ABSOLUTE_ERROR: - name = "MeanAbsoluteError"; - break; - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - #endif diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc index 253fd3cb4f..83e7c15460 100644 --- a/lib/runtime/src/ops/embedding.cc +++ b/lib/runtime/src/ops/embedding.cc @@ -77,11 +77,11 @@ static std::optional return profile(backward_kernel, profiling, "[Embedding] backward_time = {:.2lf}ms\n", - input, output, + input, weight_grad, - input.data_type, output.data_type, + input.data_type, attrs.aggr, input.shape.get_dim(), output.shape.get_dim(), diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h index f0962b15c2..5005a0b51c 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern.h +++ b/lib/substitutions/include/substitutions/pcg_pattern.h @@ -12,6 +12,10 @@ namespace FlexFlow { std::unordered_set get_nodes(PCGPattern const &); +std::optional + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg); + /** * @brief Find all locations in \p pcg that match \p pattern */ diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h index 183f76ac8a..959ba3da2c 100644 --- a/lib/substitutions/include/substitutions/unity_substitution_set.h +++ b/lib/substitutions/include/substitutions/unity_substitution_set.h @@ -6,6 +6,8 @@ #include "utils/fmt/vector.h" namespace FlexFlow { +std::optional + get_random_substitution(MachineSpecification const &resources); std::vector get_substitution_set(MachineSpecification const &resources); diff --git a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc index 194ae49255..f39b771364 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/satisfies_constraint.cc @@ -16,6 +16,16 @@ bool operator_satisfies_constraint( switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val.value() == constraint.attribute_value; + case ConstraintType::DIVISIBLE_BY: { + if (expr_val.value().has() && + constraint.attribute_value.has()) { + return expr_val.value().get() % + constraint.attribute_value.get() == + 0; + } + throw mk_runtime_error( + "DIVISIBLE_BY constraint requires nonnegative_int values"); + } default: throw mk_runtime_error( fmt::format("Unknown constraint type {}", diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc 
diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index a0af875848..fbc181a0f9 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -11,6 +11,7 @@ #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" +#include "utils/random_utils.h" namespace FlexFlow { @@ -37,6 +38,17 @@ static MatchAdditionalCriterion }}; } +std::optional<PCGPatternMatch> + get_random_pattern_match(PCGPattern const &pattern, + SubParallelComputationGraph const &pcg) { + std::vector<PCGPatternMatch> pattern_matches = + find_pattern_matches(pattern, pcg); + if (pattern_matches.empty()) { + return std::nullopt; + } + return select_random(pattern_matches); +} + std::vector<PCGPatternMatch> find_pattern_matches(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc index 83df74f21b..0c673f0a8a 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc @@ -188,34 +188,33 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs, } std::string as_dot(SubParallelComputationGraph const &spcg) { - NOT_IMPLEMENTED(); - // std::function<std::string(ParallelLayerAttrs const &)> get_node_label = - // [](ParallelLayerAttrs const &a) -> std::string { - // RecordFormatter r = as_dot(a.op_attrs); - // - // if (a.name.has_value()) { - // RecordFormatter rr; - // rr << "Name" << a.name.value(); - // r << rr; - // } - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // std::function<std::string(ParallelTensorAttrs const &)> get_input_label = - // [](ParallelTensorAttrs const &a) -> std::string { - // RecordFormatter r; - // - // r << fmt::to_string(a.shape); - // - // std::ostringstream oss; - // oss << r; - // return oss.str(); - // }; - // - // return as_dot(spcg.raw_graph, get_node_label, get_input_label); + std::function<std::string(ParallelLayerAttrs const &)> get_node_label = + [](ParallelLayerAttrs const &a) -> std::string { + RecordFormatter r = as_dot(a.op_attrs); + + if (a.name.has_value()) { + RecordFormatter rr; + rr << "Name" << a.name.value(); + r << rr; + } + + std::ostringstream oss; + oss << r; + return oss.str(); + }; + + std::function<std::string(ParallelTensorAttrs const &)> get_input_label = + [](ParallelTensorAttrs const &a) -> std::string { + RecordFormatter r; + + r << fmt::to_string(a.shape); + + std::ostringstream oss; + oss << r; + return oss.str(); + }; + + return as_dot(spcg.raw_graph, get_node_label, get_input_label); } void debug_print_dot(SubParallelComputationGraph const &spcg) { diff --git a/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc b/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc index 974bfcabc0..cc0af12c91 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/satisfies_constraint.cc @@ -12,6 +12,16 @@ bool parallel_tensor_satisfies_constraint( switch (constraint.constraint_type) { case ConstraintType::EQUAL: return expr_val == constraint.attribute_value; + case ConstraintType::DIVISIBLE_BY: { + if (expr_val.has<nonnegative_int>() && + constraint.attribute_value.has<nonnegative_int>()) { + return expr_val.get<nonnegative_int>() % + constraint.attribute_value.get<nonnegative_int>() == + 0; + } + throw mk_runtime_error( + "DIVISIBLE_BY constraint requires nonnegative_int values"); + } default: throw mk_runtime_error( fmt::format("Unknown constraint type {}",
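
get_random_pattern_match above composes the existing exhaustive find_pattern_matches with select_random from utils/random_utils.h. The contract assumed of select_random is a uniform pick from a nonempty vector, roughly as follows (hypothetical sketch; the real helper is not shown in this diff):

    #include <cstdlib>
    #include <vector>

    template <typename T>
    T select_random_sketch(std::vector<T> const &xs) {
      // callers, as above, are responsible for ruling out the empty case first
      return xs[std::rand() % xs.size()];
    }
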
diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc index 4b00cdd95f..c8d9266978 100644 --- a/lib/substitutions/src/substitutions/unity_substitution_set.cc +++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc @@ -7,9 +7,19 @@ #include "utils/containers/get_only.h" #include "utils/nonnegative_int/nonnegative_int.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/random_utils.h" namespace FlexFlow { +std::optional<Substitution> + get_random_substitution(MachineSpecification const &resources) { + std::vector<Substitution> substitutions = get_substitution_set(resources); + if (substitutions.empty()) { + return std::nullopt; + } + return select_random(substitutions); +} + std::vector<Substitution> get_substitution_set(MachineSpecification const &resources) { std::vector<Substitution> substitutions; diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index 9d8e4bc259..fa0ff7794a 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -140,7 +140,6 @@ std::vector<UnlabelledDataflowGraphPatternMatch> } } } - return matches; } diff --git a/lib/utils/include/utils/containers/subvec.h b/lib/utils/include/utils/containers/slice.h similarity index 69% rename from lib/utils/include/utils/containers/subvec.h rename to lib/utils/include/utils/containers/slice.h index c89e9227de..a82fb383b5 100644 --- a/lib/utils/include/utils/containers/subvec.h +++ b/lib/utils/include/utils/containers/slice.h @@ -9,9 +9,9 @@ namespace FlexFlow { template <typename T> -std::vector<T> subvec(std::vector<T> const &v, - std::optional<int> const &maybe_start, - std::optional<int> const &maybe_end) { +std::vector<T> slice(std::vector<T> const &v, + int const &maybe_start, + std::optional<int> const &maybe_end) { auto begin_iter = v.cbegin(); auto end_iter = v.cend(); @@ -22,15 +22,13 @@ std::vector<T> subvec(std::vector<T> const &v, if (idx < 0) { new_idx = size + idx; } - if (new_idx < 0 || new_idx > size) { - throw mk_runtime_error("Index {} is out of bounds for array {}"); - } + + ASSERT(new_idx >= 0, "Index out of bounds"); + ASSERT(new_idx <= size, "Index out of bounds"); return new_idx; }; - if (maybe_start.has_value()) { - begin_iter += resolve_loc(maybe_start.value()); - } + begin_iter += resolve_loc(maybe_start); if (maybe_end.has_value()) { end_iter = v.cbegin() + resolve_loc(maybe_end.value()); diff --git a/lib/utils/include/utils/containers/zip_strict.h b/lib/utils/include/utils/containers/zip_strict.h index 64049042d4..5606fccff1 100644 --- a/lib/utils/include/utils/containers/zip_strict.h +++ b/lib/utils/include/utils/containers/zip_strict.h @@ -4,21 +4,17 @@ #include "utils/containers/zip.h" #include "utils/exception.h" #include "utils/fmt/vector.h" +#include <libassert/assert.hpp> namespace FlexFlow { template <typename L, typename R> std::vector<std::pair<L, R>> zip_strict(std::vector<L> const &lhs, std::vector<R> const &rhs) { - if (lhs.size() != rhs.size()) { - throw mk_runtime_error( - fmt::format("zip_strict requires lhs and rhs to have the same length, " - "but received lhs={} (length {}), rhs={} (length {})", - lhs, - lhs.size(), - rhs, - rhs.size())); - } + ASSERT(lhs.size() == rhs.size(), + "zip_strict requires lhs and rhs to have the same length", + lhs, + rhs); return zip(lhs, rhs); }
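
The subvec -> slice rename above also tightens the interface: the start index is now mandatory (negative values count from the end, Python-style) while only the end index stays optional, and out-of-bounds indices now trip ASSERT instead of a formatted throw. Expected behavior, mirroring the updated tests later in this patch:

    #include "utils/containers/slice.h"
    #include <cassert>
    #include <optional>
    #include <vector>

    int main() {
      std::vector<int> v = {1, 2, 3, 4, 5};
      assert((FlexFlow::slice(v, 1, 4) == std::vector<int>{2, 3, 4}));
      assert((FlexFlow::slice(v, 2, std::nullopt) == std::vector<int>{3, 4, 5}));
      assert((FlexFlow::slice(v, -3, -1) == std::vector<int>{3, 4})); // negative = from the end
      return 0;
    }
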
diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index 080cbb3611..f95eb8a38d 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -3,6 +3,7 @@ #include "utils/fmt.h" #include +#include <libassert/assert.hpp> #include #include diff --git a/lib/utils/include/utils/full_binary_tree/as_dot.h b/lib/utils/include/utils/full_binary_tree/as_dot.h new file mode 100644 index 0000000000..e104d05e06 --- /dev/null +++ b/lib/utils/include/utils/full_binary_tree/as_dot.h @@ -0,0 +1,81 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FULL_BINARY_TREE_AS_DOT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FULL_BINARY_TREE_AS_DOT_H + +#include "utils/containers/get_only.h" +#include "utils/dot_file.h" +#include "utils/full_binary_tree/full_binary_tree_implementation.dtg.h" +#include "utils/full_binary_tree/full_binary_tree_visitor.dtg.h" +#include "utils/full_binary_tree/visit.h" +#include "utils/graph/dataflow_graph/dataflow_graph.h" +#include "utils/graph/dataflow_graph/dataflow_graph_view.h" +#include "utils/graph/digraph/digraph_view.h" +#include "utils/graph/instances/adjacency_digraph.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" +#include "utils/graph/labelled_dataflow_graph/algorithms/view_as_labelled_open_dataflow_graph.h" +#include "utils/graph/labelled_dataflow_graph/labelled_dataflow_graph.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/as_dot.h" +#include <functional> +#include <string> +#include <variant> + +namespace FlexFlow { + +template <typename Tree, typename Parent, typename Leaf, typename NodeLabel> +LabelledDataflowGraph<NodeLabel, std::monostate> as_labelled_dataflow_graph( + Tree const &tree, + FullBinaryTreeImplementation<Tree, Parent, Leaf> const &impl, + std::function<NodeLabel(Parent const &)> const &get_parent_label, + std::function<NodeLabel(Leaf const &)> const &get_leaf_label) { + auto g = LabelledDataflowGraph<NodeLabel, std::monostate>::template create< + UnorderedSetLabelledOpenDataflowGraph<NodeLabel, std::monostate>>(); + + FullBinaryTreeVisitor<DataflowOutput, Tree, Parent, Leaf> visitor = + FullBinaryTreeVisitor<DataflowOutput, Tree, Parent, Leaf>{ + [&](Parent const &parent) -> DataflowOutput { + DataflowOutput left_child_output = + visit(impl.get_left_child(parent), impl, visitor); + DataflowOutput right_child_output = + visit(impl.get_right_child(parent), impl, visitor); + NodeLabel parent_label = get_parent_label(parent); + NodeAddedResult parent_added = + g.add_node(parent_label, + {left_child_output, right_child_output}, + {std::monostate{}}); + return get_only(parent_added.outputs); + }, + [&](Leaf const &leaf) -> DataflowOutput { + NodeLabel leaf_label = get_leaf_label(leaf); + NodeAddedResult leaf_added = + g.add_node(leaf_label, {}, {std::monostate{}}); + return get_only(leaf_added.outputs); + }, + }; + + visit(tree, impl, visitor); + + return g; +} + +template <typename Tree, typename Parent, typename Leaf> +std::string + as_dot(Tree const &tree, + FullBinaryTreeImplementation<Tree, Parent, Leaf> const &impl, + std::function<std::string(Parent const &)> const &get_parent_label, + std::function<std::string(Leaf const &)> const &get_leaf_label) { + + LabelledDataflowGraphView<std::string, std::monostate> g = + as_labelled_dataflow_graph(tree, impl, get_parent_label, get_leaf_label); + + std::function<std::string(std::string const &)> get_node_label = + [](std::string const &s) { return s; }; + std::function<std::string(std::monostate const &)> get_input_label = + [](std::monostate const &) { return ""; }; + + return as_dot( + view_as_labelled_open_dataflow_graph(g), get_node_label, get_input_label); +} + +} // namespace FlexFlow + +#endif
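
The new full_binary_tree/as_dot.h renders an arbitrary full binary tree to graphviz dot by first materializing it as a labelled dataflow graph and reusing the existing dataflow-graph as_dot. The call shape, sketched with placeholder types (MyTree/MyParent/MyLeaf and the tree/impl values below are assumptions for illustration, not APIs from this patch):

    // Explicit template arguments let the lambdas convert to the
    // std::function parameters.
    std::string dot = FlexFlow::as_dot<MyTree, MyParent, MyLeaf>(
        tree,
        impl,
        /*get_parent_label=*/[](MyParent const &p) -> std::string { return "parent"; },
        /*get_leaf_label=*/[](MyLeaf const &l) -> std::string { return "leaf"; });
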
diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h index de48cd17e9..9b4ea6cd20 100644 --- a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h +++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h @@ -1,11 +1,13 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_BINARY_SP_DECOMPOSITION_TREE_H +#include "utils/full_binary_tree/binary_tree_path.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_parallel_split.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_series_split.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.dtg.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h" #include "utils/graph/series_parallel/sp_decomposition_tree_node_type.dtg.h" +#include <optional> #include namespace FlexFlow { @@ -23,6 +25,10 @@ std::unordered_multiset<Node> get_leaves(BinarySPDecompositionTree const &); SPDecompositionTreeNodeType get_node_type(BinarySPDecompositionTree const &); +std::optional<BinarySPDecompositionTree> + binary_sp_decomposition_tree_get_subtree_at_path( + BinarySPDecompositionTree const &, BinaryTreePath const &); + } // namespace FlexFlow #endif diff --git a/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h new file mode 100644 index 0000000000..9c999d8f6e --- /dev/null +++ b/lib/utils/include/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h @@ -0,0 +1,43 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_GENERIC_BINARY_SP_DECOMPOSITION_TREE_AS_DOT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_SERIES_PARALLEL_BINARY_SP_DECOMPOSITION_TREE_GENERIC_BINARY_SP_DECOMPOSITION_TREE_AS_DOT_H + +#include "utils/full_binary_tree/as_dot.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.dtg.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree_implementation.h" +#include "utils/overload.h" + +namespace FlexFlow { + +template <typename Tree, typename Series, typename Parallel, typename Leaf> +std::string as_dot( + Tree const &tree, + GenericBinarySPDecompositionTreeImplementation<Tree, Series, Parallel, Leaf> const &impl, + std::function<std::string(Series const &)> const &get_series_label, + std::function<std::string(Parallel const &)> const &get_parallel_label, + std::function<std::string(Leaf const &)> const &get_leaf_label) { + FullBinaryTreeImplementation<Tree, std::variant<Series, Parallel>, Leaf> + full_binary_tree_impl = get_full_binary_impl_from_generic_sp_impl(impl); + + std::function<std::string(std::variant<Series, Parallel> const &)> + get_parent_label = + [&](std::variant<Series, Parallel> const &parent) -> std::string { + return std::visit(overload{ + [&](Series const &series) -> std::string { + return get_series_label(series); + }, + [&](Parallel const &parallel) -> std::string { + return get_parallel_label(parallel); + }, + }, + parent); + }; + + return as_dot(tree, full_binary_tree_impl, get_parent_label, get_leaf_label); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/indent.h b/lib/utils/include/utils/indent.h new file mode 100644 index 0000000000..eccbd34cfc --- /dev/null +++ b/lib/utils/include/utils/indent.h @@ -0,0 +1,12 @@ +#ifndef
_FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_INDENT_H + +#include <string> + +namespace FlexFlow { + +std::string indent(std::string const &, int indent_size = 2); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/optional.h b/lib/utils/include/utils/optional.h index 377561d70c..8673264d36 100644 --- a/lib/utils/include/utils/optional.h +++ b/lib/utils/include/utils/optional.h @@ -32,6 +32,11 @@ T const &assert_unwrap(std::optional<T> const &o) { return o.value(); } +template <typename T> +T expect(std::optional<T> const &x, std::string const &err) { + return unwrap(x, [&]() { throw mk_runtime_error(err); }); +} + } // namespace FlexFlow #endif diff --git a/lib/utils/include/utils/random_utils.h b/lib/utils/include/utils/random_utils.h index 99da9646a1..014c38fc51 100644 --- a/lib/utils/include/utils/random_utils.h +++ b/lib/utils/include/utils/random_utils.h @@ -5,7 +5,7 @@ #include #include -float randf() { +inline float randf() { return static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX); } diff --git a/lib/utils/include/utils/stack_vector/stack_vector.h b/lib/utils/include/utils/stack_vector/stack_vector.h index 5d4d6eaad3..64d005a10e 100644 --- a/lib/utils/include/utils/stack_vector/stack_vector.h +++ b/lib/utils/include/utils/stack_vector/stack_vector.h @@ -272,18 +272,6 @@ struct stack_vector { return !(*this == other); } - bool operator<(stack_vector const &other) const { - for (std::size_t i = 0; i < std::min(this->m_size, other.m_size); i++) { - if (this->at(i) < other.at(i)) { - return true; - } else if (this->at(i) > other.at(i)) { - return false; - } - } - - return (this->m_size < other.m_size); - } - std::size_t size() const { return this->m_size; } @@ -305,17 +293,16 @@ struct stack_vector { private: std::size_t m_size = 0; std::array<T, MAXSIZE> contents; - - static_assert( - implies<is_equal_comparable<T>, is_equal_comparable<stack_vector>>::value, - ""); - static_assert( - implies<is_neq_comparable<T>, is_neq_comparable<stack_vector>>::value, - ""); - static_assert( - implies<is_lt_comparable<T>, is_lt_comparable<stack_vector>>::value, ""); }; +template <typename T, std::size_t MAXSIZE> +auto operator<(stack_vector<T, MAXSIZE> const &lhs, + stack_vector<T, MAXSIZE> const &rhs) + -> std::enable_if_t<is_lt_comparable_v<T>, bool> { + return std::lexicographical_compare( + lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); +} + template <typename T, std::size_t MAXSIZE> std::ostream &operator<<(std::ostream &s, stack_vector<T, MAXSIZE> const &v) { return s << fmt::to_string(v); diff --git a/lib/utils/src/utils/containers/slice.cc b/lib/utils/src/utils/containers/slice.cc new file mode 100644 index 0000000000..f960c21881 --- /dev/null +++ b/lib/utils/src/utils/containers/slice.cc @@ -0,0 +1,3 @@ +#include "utils/containers/slice.h" + +namespace FlexFlow {} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/subvec.cc b/lib/utils/src/utils/containers/subvec.cc deleted file mode 100644 index 93c7de31c5..0000000000 --- a/lib/utils/src/utils/containers/subvec.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/subvec.h" diff --git a/lib/utils/src/utils/full_binary_tree/as_dot.cc b/lib/utils/src/utils/full_binary_tree/as_dot.cc new file mode 100644 index 0000000000..12a1ab5533 --- /dev/null +++ b/lib/utils/src/utils/full_binary_tree/as_dot.cc @@ -0,0 +1,16 @@ +#include "utils/full_binary_tree/as_dot.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using Tree = value_type<0>; +using Parent = value_type<1>; +using Leaf = value_type<2>; + +template std::string + as_dot(Tree const &, + FullBinaryTreeImplementation<Tree, Parent, Leaf> const &, + std::function<std::string(Parent const &)> const &, + std::function<std::string(Leaf const &)> const &); + +} // namespace FlexFlow
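
The expect helper added to utils/optional.h above is a fail-loudly accessor for std::optional that, unlike assert_unwrap, lets the caller choose the error text. A small usage sketch (the config-lookup framing is invented for illustration):

    #include "utils/optional.h"
    #include <optional>

    int lookup_port(std::optional<int> const &maybe_port) {
      // throws mk_runtime_error("no port configured") when the optional is empty
      return FlexFlow::expect(maybe_port, "no port configured");
    }
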
diff --git a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc index 8445a2721a..8aed06ae01 100644 --- a/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc +++ b/lib/utils/src/utils/full_binary_tree/binary_tree_path.cc @@ -1,5 +1,5 @@ #include "utils/full_binary_tree/binary_tree_path.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" namespace FlexFlow { @@ -27,7 +27,7 @@ BinaryTreePathEntry binary_tree_path_get_top_level(BinaryTreePath const &p) { BinaryTreePath binary_tree_path_get_non_top_level(BinaryTreePath const &p) { return BinaryTreePath{ - subvec(p.entries, 1, std::nullopt), + slice(p.entries, 1, std::nullopt), }; } diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc index 62489ff75f..3e4bc13289 100644 --- a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc +++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.cc @@ -1,5 +1,6 @@ #include "utils/graph/series_parallel/binary_sp_decomposition_tree/binary_sp_decomposition_tree.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_leaves.h" +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/get_subtree_at_path.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_left_associative.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/is_binary_sp_tree_right_associative.h" @@ -82,4 +83,10 @@ SPDecompositionTreeNodeType }); } +std::optional<BinarySPDecompositionTree> + binary_sp_decomposition_tree_get_subtree_at_path( + BinarySPDecompositionTree const &tree, BinaryTreePath const &path) { + return get_subtree_at_path(tree, generic_impl_for_binary_sp_tree(), path); +} + } // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc new file mode 100644 index 0000000000..f557515c83 --- /dev/null +++ b/lib/utils/src/utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.cc @@ -0,0 +1,21 @@ +#include "utils/graph/series_parallel/binary_sp_decomposition_tree/generic_binary_sp_decomposition_tree/as_dot.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using Tree = value_type<0>; +using Series = value_type<1>; +using Parallel = value_type<2>; +using Leaf = value_type<3>; + +template std::string + as_dot(Tree const &, + GenericBinarySPDecompositionTreeImplementation<Tree, Series, Parallel, Leaf> const &, + std::function<std::string(Series const &)> const &, + std::function<std::string(Parallel const &)> const &, + std::function<std::string(Leaf const &)> const &); + +} // namespace FlexFlow
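
binary_sp_decomposition_tree_get_subtree_at_path walks a BinaryTreePath (a sequence of left/right turns) down from the root and yields std::nullopt once the path leaves the tree. A usage sketch; the LEFT_CHILD/RIGHT_CHILD enumerator names below are assumptions about BinaryTreePathEntry, not taken from this diff:

    // Select the right child of the root's left child, if it exists.
    BinaryTreePath path = BinaryTreePath{{
        BinaryTreePathEntry::LEFT_CHILD,  // assumed enumerator name
        BinaryTreePathEntry::RIGHT_CHILD, // assumed enumerator name
    }};
    std::optional<BinarySPDecompositionTree> subtree =
        binary_sp_decomposition_tree_get_subtree_at_path(tree, path);
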
"utils/containers/require_same.h" -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" #include "utils/graph/digraph/algorithms/get_predecessors.h" @@ -103,7 +103,7 @@ MultiDiEdge Node last = g.get_multidiedge_dst(reduction.edges.back()); std::vector internal_nodes; - for (MultiDiEdge const &e : subvec(reduction.edges, std::nullopt, -1)) { + for (MultiDiEdge const &e : slice(reduction.edges, 0, -1)) { internal_nodes.push_back(g.get_multidiedge_dst(e)); } diff --git a/lib/utils/src/utils/indent.cc b/lib/utils/src/utils/indent.cc new file mode 100644 index 0000000000..2761ad1878 --- /dev/null +++ b/lib/utils/src/utils/indent.cc @@ -0,0 +1,17 @@ +#include "utils/indent.h" +#include "utils/containers/flatmap.h" + +namespace FlexFlow { + +std::string indent(std::string const &s, int indent_size) { + std::string indent_str(indent_size, ' '); + return indent_str + flatmap(s, [&](char c) -> std::string { + if (c == '\n') { + return "\n" + indent_str; + } else { + return std::string{c}; + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/stack_vector/stack_vector.cc b/lib/utils/src/utils/stack_vector/stack_vector.cc index d4fb849412..e2009d74d3 100644 --- a/lib/utils/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/src/utils/stack_vector/stack_vector.cc @@ -1,9 +1,9 @@ #include "utils/stack_vector/stack_vector.h" -#include "utils/archetypes/ordered_value_type.h" +#include "utils/archetypes/value_type.h" namespace FlexFlow { -using T = ordered_value_type<0>; +using T = value_type<0>; template struct stack_vector; template struct stack_vector; diff --git a/lib/utils/test/common/include/test/utils/doctest/check_kv.h b/lib/utils/test/common/include/test/utils/doctest/check_kv.h new file mode 100644 index 0000000000..6449b8ac87 --- /dev/null +++ b/lib/utils/test/common/include/test/utils/doctest/check_kv.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H +#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_CHECK_KV_H + +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc index 9522fa7fdb..6df2d925b7 100644 --- a/lib/utils/test/common/src/main.cc +++ b/lib/utils/test/common/src/main.cc @@ -1,2 +1,15 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h" +#define DOCTEST_CONFIG_IMPLEMENT +#include + +#include +#include + +void libassert_throw_exception_handler(libassert::assertion_info const &info) { + throw std::runtime_error("Assertion failed:\n" + info.to_string()); +} + +int main(int argc, char **argv) { + libassert::set_failure_handler(libassert_throw_exception_handler); + + return doctest::Context(argc, argv).run(); +} diff --git a/lib/utils/test/common/src/test/utils/doctest/check_kv.cc b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc new file mode 100644 index 0000000000..d3c1ee335e --- /dev/null +++ b/lib/utils/test/common/src/test/utils/doctest/check_kv.cc @@ -0,0 +1,17 @@ +#include "test/utils/doctest/check_kv.h" +#include "utils/indent.h" +#include + +namespace FlexFlow { + +std::string check_kv(std::string const &k, std::string const &v) { + std::ostringstream oss; + + oss << std::endl + << indent(k + "=", /*indent_size=*/4) << std::endl + << indent(v, /*indent_size=*/6); + + return oss.str(); +} + +} 
diff --git a/lib/utils/test/src/utils/containers/subvec.cc b/lib/utils/test/src/utils/containers/slice.cc similarity index 69% rename from lib/utils/test/src/utils/containers/subvec.cc rename to lib/utils/test/src/utils/containers/slice.cc index 610fc55b5a..4e4d840bfe 100644 --- a/lib/utils/test/src/utils/containers/subvec.cc +++ b/lib/utils/test/src/utils/containers/slice.cc @@ -1,4 +1,4 @@ -#include "utils/containers/subvec.h" +#include "utils/containers/slice.h" #include "test/utils/doctest/fmt/vector.h" #include #include @@ -6,57 +6,57 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("subvec") { + TEST_CASE("slice") { std::vector<int> v = {1, 2, 3, 4, 5}; - SUBCASE("Basic subvector") { - auto result = subvec(v, 1, 4); + SUBCASE("Basic slice") { + auto result = slice(v, 1, 4); std::vector<int> correct = {2, 3, 4}; CHECK(result == correct); } SUBCASE("From beginning to index") { - auto result = subvec(v, std::nullopt, 3); + auto result = slice(v, 0, 3); std::vector<int> correct = {1, 2, 3}; CHECK(result == correct); } SUBCASE("From index to end") { - auto result = subvec(v, 2, std::nullopt); + auto result = slice(v, 2, std::nullopt); std::vector<int> correct = {3, 4, 5}; CHECK(result == correct); } SUBCASE("All of the vector") { - auto result = subvec(v, std::nullopt, std::nullopt); + auto result = slice(v, 0, std::nullopt); std::vector<int> correct = {1, 2, 3, 4, 5}; CHECK(result == correct); } SUBCASE("Start greater than end") { - auto result = subvec(v, 3, 1); + auto result = slice(v, 3, 1); std::vector<int> correct = {}; CHECK(result == correct); } SUBCASE("Start equal to end") { - auto result = subvec(v, 3, 3); + auto result = slice(v, 3, 3); std::vector<int> correct = {}; CHECK(result == correct); } SUBCASE("Negative indices") { - auto result = subvec(v, -3, -1); + auto result = slice(v, -3, -1); std::vector<int> correct = {3, 4}; CHECK(result == correct); } SUBCASE("Upper index is out of bounds by 1") { - CHECK_THROWS(subvec(v, 2, 6)); + CHECK_THROWS(slice(v, 2, 6)); } SUBCASE("Lower index is out of bounds by 1") { - CHECK_THROWS(subvec(v, -6, 2)); + CHECK_THROWS(slice(v, -6, 2)); } } } diff --git a/lib/utils/test/src/utils/indent.cc b/lib/utils/test/src/utils/indent.cc new file mode 100644 index 0000000000..b137253fae --- /dev/null +++ b/lib/utils/test/src/utils/indent.cc @@ -0,0 +1,66 @@ +#include "utils/indent.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("indent") { + SUBCASE("string is empty") { + std::string input = ""; + + std::string result = indent(input); + std::string correct = "  "; + + CHECK(result == correct); + } + + SUBCASE("string is one line") { + std::string input = "hello world"; + std::string result = indent(input); + std::string correct = "  hello world"; + + CHECK(result == correct); + } + + SUBCASE("string has multiple lines") { + std::string input = "\n" + "a b\n" + "c d\n" + "e f\n" + "g\n"; + + std::string result = indent(input); + std::string correct = "  \n" + "  a b\n" + "  c d\n" + "  e f\n" + "  g\n" + "  "; + + CHECK(result == correct); + } + + SUBCASE("leading and trailing whitespace is preserved") { + std::string input = " a b \n" + "c d e\n" + " "; + + std::string result = indent(input); + std::string correct = "   a b \n" + "  c d e\n" + "   "; + + CHECK(result == correct); + } + + SUBCASE("allows custom indent size") { + std::string input = "hello\nworld"; + + std::string result = indent(input, /*indent_size=*/4); + std::string correct = "    hello\n" + "    world"; + + CHECK(result == correct); + } + } +}
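
The member operator< deleted from stack_vector earlier in this patch compared elements left to right and then fell back to size, which is exactly lexicographic order; the free operator< that replaces it delegates to std::lexicographical_compare and is SFINAE-gated on is_lt_comparable_v<T> so that unordered element types simply lose the ordering rather than failing to compile. The semantics the tests below rely on, built with push_back (the only construction API this patch shows):

    FlexFlow::stack_vector<int, 5> lhs, rhs;
    lhs.push_back(2); lhs.push_back(1); lhs.push_back(2);
    rhs.push_back(2); rhs.push_back(1); rhs.push_back(2); rhs.push_back(3);
    bool smaller = (lhs < rhs); // true: lhs is a proper prefix of rhs
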
diff --git a/lib/utils/test/src/utils/stack_vector/stack_vector.cc b/lib/utils/test/src/utils/stack_vector/stack_vector.cc index c36de733b6..6eb2cc0d88 100644 --- a/lib/utils/test/src/utils/stack_vector/stack_vector.cc +++ b/lib/utils/test/src/utils/stack_vector/stack_vector.cc @@ -1,12 +1,97 @@ #include "utils/stack_vector/stack_vector.h" #include "test/utils/doctest/fmt/vector.h" #include "test/utils/rapidcheck.h" +#include "utils/archetypes/value_type.h" #include #include using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("operator<(stack_vector, stack_vector)") { + constexpr std::size_t MAXSIZE = 5; + + SUBCASE("T is ordered") { + SUBCASE("inputs are the same") { + std::vector<int> input = {2, 1, 2, 3}; + + bool result = (input < input); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("lhs is strict prefix of rhs") { + std::vector<int> lhs = {2, 1, 2}; + std::vector<int> rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs is empty") { + std::vector<int> lhs = {}; + std::vector<int> rhs = {2, 1, 2, 3}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("lhs has a smaller element first") { + std::vector<int> lhs = {2, 1, 0, 3}; + std::vector<int> rhs = {2, 1, 2}; + + bool result = (lhs < rhs); + bool correct = true; + + CHECK(result == correct); + } + + // from the definition of a strict total order, i.e., + // https://en.wikipedia.org/w/index.php?title=Total_order&oldid=1278541072#Strict_and_non-strict_total_orders + RC_SUBCASE("operator< is irreflexive", + [](stack_vector<int, MAXSIZE> const &input) { + RC_ASSERT(!(input < input)); + }); + + RC_SUBCASE("operator< is asymmetric", + [](stack_vector<int, MAXSIZE> const &lhs, + stack_vector<int, MAXSIZE> const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) == !(rhs < lhs)); + }); + + RC_SUBCASE("operator< is transitive", + [](stack_vector<int, MAXSIZE> const &a, + stack_vector<int, MAXSIZE> const &b, + stack_vector<int, MAXSIZE> const &c) { + RC_PRE(a < b); + RC_PRE(b < c); + + RC_ASSERT(a < c); + }); + + RC_SUBCASE("operator< is connected", + [](stack_vector<int, MAXSIZE> const &lhs, + stack_vector<int, MAXSIZE> const &rhs) { + RC_PRE(lhs != rhs); + + RC_ASSERT((lhs < rhs) || (rhs < lhs)); + }); + } + + SUBCASE("T is not ordered") { + bool result = is_lt_comparable_v<stack_vector<value_type<0>, MAXSIZE>>; + + CHECK_FALSE(result); + } + } + TEST_CASE_TEMPLATE( "stack_vector::push_back", T, int, double, char) { constexpr std::size_t MAXSIZE = 5;